IJG R6b with x86SIMD V1.02

Independent JPEG Group's JPEG software release 6b with x86 SIMD extension for IJG JPEG library version 1.02
2006-02-04 00:00:00 +00:00
parent 5ead57a34a
commit a2e6a9dd47
156 changed files with 49018 additions and 4283 deletions
--- a/aclocal.m4
+++ b/aclocal.m4
--- a/altui/README.alt
+++ b/altui/README.alt
@@ -0,0 +1,71 @@
+Here is an alternate command-line user interface for the IJG JPEG software.
+It is designed for use under MS-DOS, and may also be useful on other non-Unix
+operating systems.  (For that matter, this code works fine on Unix, but the
+standard command-line syntax is better on Unix because it is pipe-friendly.)
+
+With this user interface, cjpeg and djpeg accept multiple input file names
+on the command line; output file names are generated by substituting
+appropriate extensions.  The user is prompted before any already-existing
+file will be overwritten.  See usage.alt for details.
+
+Expansion of wild-card file specifications is useful but is not directly
+provided by this code.  Most DOS C compilers have the ability to do wild-card
+expansion "behind the scenes", and we rely on that feature.  On other systems,
+the shell may do it for you, as is done on Unix.
+
+Also, a DOS-specific routine is provided to determine available memory;
+this makes the -maxmemory switch unnecessary except in unusual cases.
+If you know how to determine available memory on a different system,
+you can easily add the necessary code.  (And please send it along to
+jpeg-info@uunet.uu.net so we can include it in future releases!)
+
+
+INSTALLATION
+============
+
+You need to have the main IJG JPEG distribution, release 6 or later.
+Replace the standard cjpeg.c and djpeg.c files with the ones provided here.
+Then build the software as described in the main distribution's install.doc
+file, with these exceptions:
+
+* Define PROGRESS_REPORT in jconfig.h if you want the percent-done display.
+* Define NO_OVERWRITE_CHECK if you *don't* want overwrite confirmation.
+* You may ignore the USE_SETMODE and TWO_FILE_COMMANDLINE symbols discussed
+  in install.doc; these files do not use them.
+* As given, djpeg.c defaults to GIF output (not PPM output as in the standard
+  djpeg.c).  If you want something different, modify DEFAULT_FMT.
+
+You may also need to do something special to enable filename wild-card
+expansion, assuming your compiler has that capability at all.
+
+Modify the standard usage.doc file as described in usage.alt.  (If you want
+to use the Unix-style manual pages cjpeg.1 and djpeg.1, better fix them too.)
+
+
+Here are some specific notes for popular MS-DOS compilers:
+
+Borland C:
+  Add "-DMSDOS" to CFLAGS to enable use of the DOS memory determination code.
+  Link with the standard library file WILDARGS.OBJ to get wild-card expansion.
+
+Microsoft C:
+  Add "-DMSDOS" to CFLAGS to enable use of the DOS memory determination code.
+  Link with the standard library file SETARGV.OBJ to get wild-card expansion.
+  In the versions I've used, you must also add /NOE to the linker switches to
+  avoid a duplicate-symbol error from including SETARGV.
+
+DJGPP (we recommend version 2.0 or later):
+  Add "-DFREE_MEM_ESTIMATE=0" to CFLAGS.  Wild-card expansion is automatic.
+
+
+LEGAL ISSUES
+============
+
+This software is copyright (C) 1991-1998, Thomas G. Lane.
+Terms of distribution and use are the same as for the free IJG JPEG software;
+see its README file for details.
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
--- a/altui/cjpeg.c
+++ b/altui/cjpeg.c
@@ -0,0 +1,813 @@
+/*
+ * alternate cjpeg.c
+ *
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 6, 2006
+ * ---------------------------------------------------------------------
+ *
+ * This file contains an alternate user interface for the JPEG compressor.
+ * One or more input files are named on the command line, and output file
+ * names are created by substituting ".jpg" for the input file's extension.
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"		/* for version message */
+
+#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef __MWERKS__
+#include <SIOUX.h>              /* Metrowerks needs this */
+#include <console.h>		/* ... and this */
+#endif
+#ifdef THINK_C
+#include <console.h>		/* Think declares it here */
+#endif
+#endif
+
+#ifndef PATH_MAX		/* ANSI maximum-pathname-length constant */
+#define PATH_MAX 256
+#endif
+
+
+/* Create the add-on message string table. */
+
+#define JMESSAGE(code,string)	string ,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * SIMD Ext: compiler-specific hacks to enable filename wild-card expansion
+ */
+
+#ifdef _MSC_VER		/* Microsoft Visual C++ */
+/* from setargv.c (setargv.obj) */
+/* Tested under Visual C++ V6.0, Toolkit 2003, and 2005 Express Edition */
+int __cdecl _setargv(void) { int __cdecl __setargv(void); return __setargv(); }
+#endif
+#ifdef __BORLANDC__	/* Borland C++ */
+/* from wildargs.c (wildargs.obj) */
+/* Tested under Borland C++ Compiler 5.5 (win32) */
+#include <wildargs.h>
+typedef void _RTLENTRY (* _RTLENTRY _argv_expand_fnc)(char *, _PFN_ADDARG);
+_argv_expand_fnc _argv_expand_ptr = _expand_wild;
+#endif
+
+
+/*
+ * Automatic determination of available memory.
+ */
+
+static long default_maxmem;	/* saves value determined at startup, or 0 */
+
+#ifndef FREE_MEM_ESTIMATE	/* may be defined from command line */
+
+#ifdef MSDOS			/* For MS-DOS (unless flat-memory model) */
+
+#include <dos.h>		/* for access to intdos() call */
+
+LOCAL(long)
+unused_dos_memory (void)
+/* Obtain total amount of unallocated DOS memory */
+{
+  union REGS regs;
+  long nparas;
+
+  regs.h.ah = 0x48;		/* DOS function Allocate Memory Block */
+  regs.x.bx = 0xFFFF;		/* Ask for more memory than DOS can have */
+  (void) intdos(&regs, &regs);
+  /* DOS will fail and return # of paragraphs actually available in BX. */
+  nparas = (unsigned int) regs.x.bx;
+  /* Times 16 to convert to bytes. */
+  return nparas << 4;
+}
+
+/* The default memory setting is 95% of the available space. */
+#define FREE_MEM_ESTIMATE  ((unused_dos_memory() * 95L) / 100L)
+
+#endif /* MSDOS */
+
+#ifdef ATARI			/* For Atari ST/STE/TT, Pure C or Turbo C */
+
+#include <ext.h>
+
+/* The default memory setting is 90% of the available space. */
+#define FREE_MEM_ESTIMATE  (((long) coreleft() * 90L) / 100L)
+
+#endif /* ATARI */
+
+/* Add memory-estimation procedures for other operating systems here,
+ * with appropriate #ifdef's around them.
+ */
+
+#endif /* !FREE_MEM_ESTIMATE */
+
+
+/*
+ * This routine determines what format the input file is,
+ * and selects the appropriate input-reading module.
+ *
+ * To determine which family of input formats the file belongs to,
+ * we may look only at the first byte of the file, since C does not
+ * guarantee that more than one character can be pushed back with ungetc.
+ * Looking at additional bytes would require one of these approaches:
+ *     1) assume we can fseek() the input file (fails for piped input);
+ *     2) assume we can push back more than one character (works in
+ *        some C implementations, but unportable);
+ *     3) provide our own buffering (breaks input readers that want to use
+ *        stdio directly, such as the RLE library);
+ * or  4) don't put back the data, and modify the input_init methods to assume
+ *        they start reading after the start of file (also breaks RLE library).
+ * #1 is attractive for MS-DOS but is untenable on Unix.
+ *
+ * The most portable solution for file types that can't be identified by their
+ * first byte is to make the user tell us what they are.  This is also the
+ * only approach for "raw" file types that contain only arbitrary values.
+ * We presently apply this method for Targa files.  Most of the time Targa
+ * files start with 0x00, so we recognize that case.  Potentially, however,
+ * a Targa file could start with any byte value (byte 0 is the length of the
+ * seldom-used ID field), so we provide a switch to force Targa input mode.
+ */
+
+static boolean is_targa;	/* records user -targa switch */
+
+
+LOCAL(cjpeg_source_ptr)
+select_file_type (j_compress_ptr cinfo, FILE * infile)
+{
+  int c;
+
+  if (is_targa) {
+#ifdef TARGA_SUPPORTED
+    return jinit_read_targa(cinfo);
+#else
+    ERREXIT(cinfo, JERR_TGA_NOTCOMP);
+#endif
+  }
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+  if (ungetc(c, infile) == EOF)
+    ERREXIT(cinfo, JERR_UNGETC_FAILED);
+
+  switch (c) {
+#ifdef BMP_SUPPORTED
+  case 'B':
+    return jinit_read_bmp(cinfo);
+#endif
+#ifdef GIF_SUPPORTED
+  case 'G':
+    return jinit_read_gif(cinfo);
+#endif
+#ifdef PPM_SUPPORTED
+  case 'P':
+    return jinit_read_ppm(cinfo);
+#endif
+#ifdef RLE_SUPPORTED
+  case 'R':
+    return jinit_read_rle(cinfo);
+#endif
+#ifdef TARGA_SUPPORTED
+  case 0x00:
+    return jinit_read_targa(cinfo);
+#endif
+  default:
+    ERREXIT(cinfo, JERR_UNKNOWN_FORMAT);
+    break;
+  }
+
+  return NULL;			/* suppress compiler warnings */
+}
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, ie, intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ */
+
+
+static const char * progname;	/* program name for error messages */
+static char * outfilename;	/* for -outfile switch */
+
+
+LOCAL(void)
+usage (void)
+/* complain about bad command line */
+{
+  fprintf(stderr, "usage: %s [switches] inputfile(s)\n", progname);
+  fprintf(stderr, "List of input files may use wildcards (* and ?)\n");
+  fprintf(stderr, "Output filename is same as input filename, but extension .jpg\n");
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -quality N     Compression quality (0..100; 5-95 is useful range)\n");
+  fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
+#ifdef ENTROPY_OPT_SUPPORTED
+  fprintf(stderr, "  -optimize      Optimize Huffman table (smaller file, but slow compression)\n");
+#endif
+#ifdef C_PROGRESSIVE_SUPPORTED
+  fprintf(stderr, "  -progressive   Create progressive JPEG file\n");
+#endif
+#ifdef TARGA_SUPPORTED
+  fprintf(stderr, "  -targa         Input file is Targa format (usually not needed)\n");
+#endif
+  fprintf(stderr, "Switches for advanced users:\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
+	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+#endif
+  fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
+#ifdef INPUT_SMOOTHING_SUPPORTED
+  fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
+#endif
+#ifndef FREE_MEM_ESTIMATE
+  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+#endif
+  fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "Switches for wizards:\n");
+#ifdef C_ARITH_CODING_SUPPORTED
+  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
+#endif
+  fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
+  fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
+  fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
+  fprintf(stderr, "  -sample HxV[,...]  Set component sampling factors\n");
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+  fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
+#endif
+  exit(EXIT_FAILURE);
+}
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, char * labelstr, unsigned int simd)
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr,
+	  simd & JSIMD_MMX   ? " MMX"    : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
+LOCAL(int)
+parse_switches (j_compress_ptr cinfo, int argc, char **argv,
+		int last_file_arg_seen, boolean for_real)
+/* Parse optional switches.
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored;
+ * they have presumably been processed in a previous iteration.
+ * (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; we may skip any expensive
+ * processing.
+ */
+{
+  int argn;
+  char * arg;
+  int quality;			/* -quality parameter */
+  int q_scale_factor;		/* scaling percentage for -qtables */
+  boolean force_baseline;
+  boolean simple_progressive;
+  char * qtablefile = NULL;	/* saves -qtables filename if any */
+  char * qslotsarg = NULL;	/* saves -qslots parm if any */
+  char * samplearg = NULL;	/* saves -sample parm if any */
+  char * scansarg = NULL;	/* saves -scans parm if any */
+
+  /* Set up default JPEG parameters. */
+  /* Note that default -quality level need not, and does not,
+   * match the default scaling for an explicit -qtables argument.
+   */
+  quality = 75;			/* default -quality value */
+  q_scale_factor = 100;		/* default to no scaling for -qtables */
+  force_baseline = FALSE;	/* by default, allow 16-bit quantizers */
+  simple_progressive = FALSE;
+  is_targa = FALSE;
+  outfilename = NULL;
+  cinfo->err->trace_level = 0;
+  if (default_maxmem > 0)	/* override library's default value */
+    cinfo->mem->max_memory_to_use = default_maxmem;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+	outfilename = NULL;	/* -outfile applies to just one input file */
+	continue;		/* ignore this name if previously processed */
+      }
+      break;			/* else done parsing switches */
+    }
+    arg++;			/* advance past switch marker character */
+
+    if (keymatch(arg, "arithmetic", 1)) {
+      /* Use arithmetic coding. */
+#ifdef C_ARITH_CODING_SUPPORTED
+      cinfo->arith_code = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "baseline", 1)) {
+      /* Force baseline-compatible output (8-bit quantizer values). */
+      force_baseline = TRUE;
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select DCT algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "int", 1)) {
+	cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+	cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+	cinfo->dct_method = JDCT_FLOAT;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts. */
+      /* On first -d, print version identification */
+      static boolean printed_version = FALSE;
+
+      if (! printed_version) {
+	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
+		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_forward_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_forward_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_forward_dct(cinfo, JDCT_FLOAT));
+#endif
+	print_simd_info(stderr, "Downsampling (-sample 2x2 or 2x1)  :",
+			jpeg_simd_downsampler(cinfo));
+	print_simd_info(stderr, "Colorspace conversion (RGB->YCbCr) :",
+			jpeg_simd_color_converter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+	printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;
+
+    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+      /* Force a monochrome JPEG file to be generated. */
+      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in Kb (or Mb with 'm'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (ch == 'm' || ch == 'M')
+	lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
+      /* Enable entropy parm optimization. */
+#ifdef ENTROPY_OPT_SUPPORTED
+      cinfo->optimize_coding = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      outfilename = argv[argn];	/* save it away for later use */
+
+    } else if (keymatch(arg, "progressive", 1)) {
+      /* Select simple progressive mode. */
+#ifdef C_PROGRESSIVE_SUPPORTED
+      simple_progressive = TRUE;
+      /* We must postpone execution until num_components is known. */
+#else
+      fprintf(stderr, "%s: sorry, progressive output was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "quality", 1)) {
+      /* Quality factor (quantization table scaling factor). */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &quality) != 1)
+	usage();
+      /* Change scale factor in case -qtables is present. */
+      q_scale_factor = jpeg_quality_scaling(quality);
+
+    } else if (keymatch(arg, "qslots", 2)) {
+      /* Quantization table slot numbers. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      qslotsarg = argv[argn];
+      /* Must delay setting qslots until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default quant table numbers.
+       */
+
+    } else if (keymatch(arg, "qtables", 2)) {
+      /* Quantization tables fetched from file. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      qtablefile = argv[argn];
+      /* We postpone actually reading the file in case -quality comes later. */
+
+    } else if (keymatch(arg, "restart", 1)) {
+      /* Restart interval in MCU rows (or in MCUs with 'b'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (lval < 0 || lval > 65535L)
+	usage();
+      if (ch == 'b' || ch == 'B') {
+	cinfo->restart_interval = (unsigned int) lval;
+	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
+      } else {
+	cinfo->restart_in_rows = (int) lval;
+	/* restart_interval will be computed during startup */
+      }
+
+    } else if (keymatch(arg, "sample", 2)) {
+      /* Set sampling factors. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      samplearg = argv[argn];
+      /* Must delay setting sample factors until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default sampling factors.
+       */
+
+    } else if (keymatch(arg, "scans", 2)) {
+      /* Set scan script. */
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      scansarg = argv[argn];
+      /* We must postpone reading the file in case -progressive appears. */
+#else
+      fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "smooth", 2)) {
+      /* Set input smoothing factor. */
+      int val;
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+	usage();
+      if (val < 0 || val > 100)
+	usage();
+      cinfo->smoothing_factor = val;
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Input file is Targa format. */
+      is_targa = TRUE;
+
+    } else {
+      usage();			/* bogus switch */
+    }
+  }
+
+  /* Post-switch-scanning cleanup */
+
+  if (for_real) {
+
+    /* Set quantization tables for selected quality. */
+    /* Some or all may be overridden if -qtables is present. */
+    jpeg_set_quality(cinfo, quality, force_baseline);
+
+    if (qtablefile != NULL)	/* process -qtables if it was present */
+      if (! read_quant_tables(cinfo, qtablefile,
+			      q_scale_factor, force_baseline))
+	usage();
+
+    if (qslotsarg != NULL)	/* process -qslots if it was present */
+      if (! set_quant_slots(cinfo, qslotsarg))
+	usage();
+
+    if (samplearg != NULL)	/* process -sample if it was present */
+      if (! set_sample_factors(cinfo, samplearg))
+	usage();
+
+#ifdef C_PROGRESSIVE_SUPPORTED
+    if (simple_progressive)	/* process -progressive; -scans can override */
+      jpeg_simple_progression(cinfo);
+#endif
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+    if (scansarg != NULL)	/* process -scans if it was present */
+      if (! read_scan_script(cinfo, scansarg))
+	usage();
+#endif
+  }
+
+  return argn;			/* return index of next arg (file name) */
+}
+
+
+/*
+ * Check for overwrite of an existing file; clear it with user
+ */
+
+#ifndef NO_OVERWRITE_CHECK
+
+LOCAL(boolean)
+is_write_ok (char * outfname)
+{
+  FILE * ofile;
+  int ch;
+
+  ofile = fopen(outfname, READ_BINARY);
+  if (ofile == NULL)
+    return TRUE;		/* not present */
+  fclose(ofile);		/* oops, it is present */
+
+  for (;;) {
+    fprintf(stderr, "%s already exists, overwrite it? [y/n] ",
+	    outfname);
+    fflush(stderr);
+    ch = getc(stdin);
+    if (ch != '\n')		/* flush rest of line */
+      while (getc(stdin) != '\n')
+	/* nothing */;
+
+    switch (ch) {
+    case 'Y':
+    case 'y':
+      return TRUE;
+    case 'N':
+    case 'n':
+      return FALSE;
+    /* otherwise, ask again */
+    }
+  }
+}
+
+#endif
+
+
+/*
+ * Process a single input file name, and return its index in argv[].
+ * File names at or to left of old_file_index have been processed already.
+ */
+
+LOCAL(int)
+process_one_file (int argc, char **argv, int old_file_index)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  char *infilename;
+  char workfilename[PATH_MAX];
+#ifdef PROGRESS_REPORT
+  struct cdjpeg_progress_mgr progress;
+#endif
+  int file_index;
+  cjpeg_source_ptr src_mgr;
+  FILE * input_file = NULL;
+  FILE * output_file = NULL;
+  JDIMENSION num_scanlines;
+
+  /* Initialize the JPEG compression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Now safe to enable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) &cinfo);
+#endif
+
+  /* Initialize JPEG parameters.
+   * Much of this may be overridden later.
+   * In particular, we don't yet know the input file's color space,
+   * but we need to provide some value for jpeg_set_defaults() to work.
+   */
+
+  cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
+  jpeg_set_defaults(&cinfo);
+
+  /* Scan command line to find next file name.
+   * It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, FALSE);
+  if (file_index >= argc) {
+    fprintf(stderr, "%s: missing input file name\n", progname);
+    usage();
+  }
+
+  /* Open the input file. */
+  infilename = argv[file_index];
+  if ((input_file = fopen(infilename, READ_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, infilename);
+    goto fail;
+  }
+
+#ifdef PROGRESS_REPORT
+  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+#endif
+
+  /* Figure out the input file format, and set up to read it. */
+  src_mgr = select_file_type(&cinfo, input_file);
+  src_mgr->input_file = input_file;
+
+  /* Read the input file header to obtain file size & colorspace. */
+  (*src_mgr->start_input) (&cinfo, src_mgr);
+
+  /* Now that we know input colorspace, fix colorspace-dependent defaults */
+  jpeg_default_colorspace(&cinfo);
+
+  /* Adjust default compression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, TRUE);
+
+  /* If user didn't supply -outfile switch, select output file name. */
+  if (outfilename == NULL) {
+    int i;
+
+    outfilename = workfilename;
+    /* Make outfilename be infilename with .jpg substituted for extension */
+    strcpy(outfilename, infilename);
+    for (i = strlen(outfilename)-1; i >= 0; i--) {
+      switch (outfilename[i]) {
+      case ':':
+      case '/':
+      case '\\':
+	i = 0;			/* stop scanning */
+	break;
+      case '.':
+	outfilename[i] = '\0';	/* lop off existing extension */
+	i = 0;			/* stop scanning */
+	break;
+      default:
+	break;			/* keep scanning */
+      }
+    }
+    strcat(outfilename, ".jpg");
+  }
+
+  fprintf(stderr, "Compressing %s => %s\n", infilename, outfilename);
+#ifndef NO_OVERWRITE_CHECK
+  if (! is_write_ok(outfilename))
+    goto fail;
+#endif
+
+  /* Open the output file. */
+  if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't create %s\n", progname, outfilename);
+    goto fail;
+  }
+
+  /* Specify data destination for compression */
+  jpeg_stdio_dest(&cinfo, output_file);
+
+  /* Start compressor */
+  jpeg_start_compress(&cinfo, TRUE);
+
+  /* Process data */
+  while (cinfo.next_scanline < cinfo.image_height) {
+    num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
+    (void) jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
+  }
+
+  /* Finish compression and release memory */
+  (*src_mgr->finish_input) (&cinfo, src_mgr);
+  jpeg_finish_compress(&cinfo);
+
+  /* Clean up and exit */
+fail:
+  jpeg_destroy_compress(&cinfo);
+
+  if (input_file != NULL) fclose(input_file);
+  if (output_file != NULL) fclose(output_file);
+
+#ifdef PROGRESS_REPORT
+  end_progress_monitor((j_common_ptr) &cinfo);
+#endif
+
+  /* Disable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) NULL);
+#endif
+
+  return file_index;
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main (int argc, char **argv)
+{
+  int file_index;
+
+  /* On Mac, fetch a command line. */
+#ifdef USE_CCOMMAND
+  argc = ccommand(&argv);
+#endif
+
+#ifdef MSDOS
+  progname = "cjpeg";		/* DOS tends to be too verbose about argv[0] */
+#else
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "cjpeg";		/* in case C library doesn't provide it */
+#endif
+
+  /* The default maxmem must be computed only once at program startup,
+   * since releasing memory with free() won't give it back to the OS.
+   */
+#ifdef FREE_MEM_ESTIMATE
+  default_maxmem = FREE_MEM_ESTIMATE;
+#else
+  default_maxmem = 0;
+#endif
+
+  /* Scan command line, parse switches and locate input file names */
+
+  if (argc < 2)
+    usage();			/* nothing on the command line?? */
+
+  file_index = 0;
+
+  while (file_index < argc-1)
+    file_index = process_one_file(argc, argv, file_index);
+
+  /* All done. */
+  exit(EXIT_SUCCESS);
+  return 0;			/* suppress no-return-value warnings */
+}
--- a/altui/djpeg.c
+++ b/altui/djpeg.c
@@ -0,0 +1,836 @@
+/*
+ * alternate djpeg.c
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 6, 2006
+ * ---------------------------------------------------------------------
+ *
+ * This file contains an alternate user interface for the JPEG decompressor.
+ * One or more input files are named on the command line, and output file
+ * names are created by substituting an appropriate extension.
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"		/* for version message */
+
+#include <ctype.h>		/* to declare isprint() */
+
+#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef __MWERKS__
+#include <SIOUX.h>              /* Metrowerks needs this */
+#include <console.h>		/* ... and this */
+#endif
+#ifdef THINK_C
+#include <console.h>		/* Think declares it here */
+#endif
+#endif
+
+#ifndef PATH_MAX		/* ANSI maximum-pathname-length constant */
+#define PATH_MAX 256
+#endif
+
+
+/* Create the add-on message string table. */
+
+#define JMESSAGE(code,string)	string ,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * SIMD Ext: compiler-specific hacks to enable filename wild-card expansion
+ */
+
+#ifdef _MSC_VER		/* Microsoft Visual C++ */
+/* from setargv.c (setargv.obj) */
+/* Tested under Visual C++ V6.0, Toolkit 2003, and 2005 Express Edition */
+int __cdecl _setargv(void) { int __cdecl __setargv(void); return __setargv(); }
+#endif
+#ifdef __BORLANDC__	/* Borland C++ */
+/* from wildargs.c (wildargs.obj) */
+/* Tested under Borland C++ Compiler 5.5 (win32) */
+#include <wildargs.h>
+typedef void _RTLENTRY (* _RTLENTRY _argv_expand_fnc)(char *, _PFN_ADDARG);
+_argv_expand_fnc _argv_expand_ptr = _expand_wild;
+#endif
+
+
+/*
+ * Automatic determination of available memory.
+ */
+
+static long default_maxmem;	/* saves value determined at startup, or 0 */
+
+#ifndef FREE_MEM_ESTIMATE	/* may be defined from command line */
+
+#ifdef MSDOS			/* For MS-DOS (unless flat-memory model) */
+
+#include <dos.h>		/* for access to intdos() call */
+
+LOCAL(long)
+unused_dos_memory (void)
+/* Obtain total amount of unallocated DOS memory */
+{
+  union REGS regs;
+  long nparas;
+
+  regs.h.ah = 0x48;		/* DOS function Allocate Memory Block */
+  regs.x.bx = 0xFFFF;		/* Ask for more memory than DOS can have */
+  (void) intdos(&regs, &regs);
+  /* DOS will fail and return # of paragraphs actually available in BX. */
+  nparas = (unsigned int) regs.x.bx;
+  /* Times 16 to convert to bytes. */
+  return nparas << 4;
+}
+
+/* The default memory setting is 95% of the available space. */
+#define FREE_MEM_ESTIMATE  ((unused_dos_memory() * 95L) / 100L)
+
+#endif /* MSDOS */
+
+#ifdef ATARI			/* For Atari ST/STE/TT, Pure C or Turbo C */
+
+#include <ext.h>
+
+/* The default memory setting is 90% of the available space. */
+#define FREE_MEM_ESTIMATE  (((long) coreleft() * 90L) / 100L)
+
+#endif /* ATARI */
+
+/* Add memory-estimation procedures for other operating systems here,
+ * with appropriate #ifdef's around them.
+ */
+
+#endif /* !FREE_MEM_ESTIMATE */
+
+
+/*
+ * This list defines the known output image formats
+ * (not all of which need be supported by a given version).
+ * You can change the default output format by defining DEFAULT_FMT;
+ * indeed, you had better do so if you undefine PPM_SUPPORTED.
+ */
+
+typedef enum {
+	FMT_BMP,		/* BMP format (Windows flavor) */
+	FMT_GIF,		/* GIF format */
+	FMT_OS2,		/* BMP format (OS/2 flavor) */
+	FMT_PPM,		/* PPM/PGM (PBMPLUS formats) */
+	FMT_RLE,		/* RLE format */
+	FMT_TARGA,		/* Targa format */
+	FMT_TIFF		/* TIFF format */
+} IMAGE_FORMATS;
+
+#ifndef DEFAULT_FMT		/* so can override from CFLAGS in Makefile */
+#define DEFAULT_FMT	FMT_GIF
+#endif
+
+static IMAGE_FORMATS requested_fmt;
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, ie, intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ */
+
+
+static const char * progname;	/* program name for error messages */
+static char * outfilename;	/* for -outfile switch */
+
+
+LOCAL(void)
+usage (void)
+/* complain about bad command line */
+{
+  fprintf(stderr, "usage: %s [switches] inputfile(s)\n", progname);
+  fprintf(stderr, "List of input files may use wildcards (* and ?)\n");
+  fprintf(stderr, "Output filename is same as input filename except for extension\n");
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -colors N      Reduce image to no more than N colors\n");
+  fprintf(stderr, "  -fast          Fast, low-quality processing\n");
+  fprintf(stderr, "  -grayscale     Force grayscale output\n");
+#ifdef IDCT_SCALING_SUPPORTED
+  fprintf(stderr, "  -scale M/N     Scale output image by fraction M/N, eg, 1/8\n");
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
+	  (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
+#endif
+#ifdef GIF_SUPPORTED
+  fprintf(stderr, "  -gif           Select GIF output format%s\n",
+	  (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
+	  (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
+#endif
+#ifdef PPM_SUPPORTED
+  fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
+	  (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
+#endif
+#ifdef RLE_SUPPORTED
+  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
+	  (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
+#endif
+#ifdef TARGA_SUPPORTED
+  fprintf(stderr, "  -targa         Select Targa output format%s\n",
+	  (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
+#endif
+  fprintf(stderr, "Switches for advanced users:\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
+	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+#endif
+  fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
+  fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
+  fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
+#ifdef QUANT_2PASS_SUPPORTED
+  fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
+#endif
+  fprintf(stderr, "  -nosmooth      Don't use high-quality upsampling\n");
+#ifdef QUANT_1PASS_SUPPORTED
+  fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
+#endif
+#ifndef FREE_MEM_ESTIMATE
+  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+#endif
+  fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  exit(EXIT_FAILURE);
+}
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, char * labelstr, unsigned int simd)
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr,
+	  simd & JSIMD_MMX   ? " MMX"    : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
+LOCAL(int)
+parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
+		int last_file_arg_seen, boolean for_real)
+/* Parse optional switches.
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored;
+ * they have presumably been processed in a previous iteration.
+ * (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; we may skip any expensive
+ * processing.
+ */
+{
+  int argn;
+  char * arg;
+
+  /* Set up default JPEG parameters. */
+  requested_fmt = DEFAULT_FMT;	/* set default output file format */
+  outfilename = NULL;
+  cinfo->err->trace_level = 0;
+  if (default_maxmem > 0)	/* override library's default value */
+    cinfo->mem->max_memory_to_use = default_maxmem;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+	outfilename = NULL;	/* -outfile applies to just one input file */
+	continue;		/* ignore this name if previously processed */
+      }
+      break;			/* else done parsing switches */
+    }
+    arg++;			/* advance past switch marker character */
+
+    if (keymatch(arg, "bmp", 1)) {
+      /* BMP output format. */
+      requested_fmt = FMT_BMP;
+
+    } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
+	       keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
+      /* Do color quantization. */
+      int val;
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+	usage();
+      cinfo->desired_number_of_colors = val;
+      cinfo->quantize_colors = TRUE;
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select IDCT algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "int", 1)) {
+	cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+	cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+	cinfo->dct_method = JDCT_FLOAT;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "dither", 2)) {
+      /* Select dithering algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "fs", 2)) {
+	cinfo->dither_mode = JDITHER_FS;
+      } else if (keymatch(argv[argn], "none", 2)) {
+	cinfo->dither_mode = JDITHER_NONE;
+      } else if (keymatch(argv[argn], "ordered", 2)) {
+	cinfo->dither_mode = JDITHER_ORDERED;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts. */
+      /* On first -d, print version identification */
+      static boolean printed_version = FALSE;
+
+      if (! printed_version) {
+	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
+		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT));
+#endif
+#ifdef IDCT_SCALING_SUPPORTED
+	print_simd_info(stderr, "Reduced-size DCT      (-scale M/N) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT+1));
+#endif
+	print_simd_info(stderr, "High-quality upsampling (default)  :",
+			jpeg_simd_upsampler(cinfo, TRUE));
+	print_simd_info(stderr, "Low-quality upsampling (-nosmooth) :",
+			jpeg_simd_upsampler(cinfo, FALSE));
+	print_simd_info(stderr, "Colorspace conversion (YCbCr->RGB) :",
+			jpeg_simd_color_deconverter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+	printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;
+
+    } else if (keymatch(arg, "fast", 1)) {
+      /* Select recommended processing options for quick-and-dirty output. */
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->dither_mode = JDITHER_ORDERED;
+      if (! cinfo->quantize_colors) /* don't override an earlier -colors */
+	cinfo->desired_number_of_colors = 216;
+      cinfo->dct_method = JDCT_FASTEST;
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "gif", 1)) {
+      /* GIF output format. */
+      requested_fmt = FMT_GIF;
+
+    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+      /* Force monochrome output. */
+      cinfo->out_color_space = JCS_GRAYSCALE;
+
+    } else if (keymatch(arg, "map", 3)) {
+      /* Quantize to a color map taken from an input file. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (for_real) {		/* too expensive to do twice! */
+#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
+	FILE * mapfile;
+
+	if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
+	  fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+	  exit(EXIT_FAILURE);
+	}
+	read_color_map(cinfo, mapfile);
+	fclose(mapfile);
+	cinfo->quantize_colors = TRUE;
+#else
+	ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+      }
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in Kb (or Mb with 'm'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (ch == 'm' || ch == 'M')
+	lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "nosmooth", 3)) {
+      /* Suppress fancy upsampling */
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "onepass", 3)) {
+      /* Use fast one-pass quantization. */
+      cinfo->two_pass_quantize = FALSE;
+
+    } else if (keymatch(arg, "os2", 3)) {
+      /* BMP output format (OS/2 flavor). */
+      requested_fmt = FMT_OS2;
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      outfilename = argv[argn];	/* save it away for later use */
+
+    } else if (keymatch(arg, "pnm", 1) || keymatch(arg, "ppm", 1)) {
+      /* PPM/PGM output format. */
+      requested_fmt = FMT_PPM;
+
+    } else if (keymatch(arg, "rle", 1)) {
+      /* RLE output format. */
+      requested_fmt = FMT_RLE;
+
+    } else if (keymatch(arg, "scale", 1)) {
+      /* Scale the output image by a fraction M/N. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d/%d",
+		 &cinfo->scale_num, &cinfo->scale_denom) != 2)
+	usage();
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Targa output format. */
+      requested_fmt = FMT_TARGA;
+
+    } else {
+      usage();			/* bogus switch */
+    }
+  }
+
+  return argn;			/* return index of next arg (file name) */
+}
+
+
+/*
+ * Marker processor for COM and interesting APPn markers.
+ * This replaces the library's built-in processor, which just skips the marker.
+ * We want to print out the marker as text, to the extent possible.
+ * Note this code relies on a non-suspending data source.
+ */
+
+LOCAL(unsigned int)
+jpeg_getc (j_decompress_ptr cinfo)
+/* Read next byte */
+{
+  struct jpeg_source_mgr * datasrc = cinfo->src;
+
+  if (datasrc->bytes_in_buffer == 0) {
+    if (! (*datasrc->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  }
+  datasrc->bytes_in_buffer--;
+  return GETJOCTET(*datasrc->next_input_byte++);
+}
+
+
+METHODDEF(boolean)
+print_text_marker (j_decompress_ptr cinfo)
+{
+  boolean traceit = (cinfo->err->trace_level >= 1);
+  INT32 length;
+  unsigned int ch;
+  unsigned int lastch = 0;
+
+  length = jpeg_getc(cinfo) << 8;
+  length += jpeg_getc(cinfo);
+  length -= 2;			/* discount the length word itself */
+
+  if (traceit) {
+    if (cinfo->unread_marker == JPEG_COM)
+      fprintf(stderr, "Comment, length %ld:\n", (long) length);
+    else			/* assume it is an APPn otherwise */
+      fprintf(stderr, "APP%d, length %ld:\n",
+	      cinfo->unread_marker - JPEG_APP0, (long) length);
+  }
+
+  while (--length >= 0) {
+    ch = jpeg_getc(cinfo);
+    if (traceit) {
+      /* Emit the character in a readable form.
+       * Nonprintables are converted to \nnn form,
+       * while \ is converted to \\.
+       * Newlines in CR, CR/LF, or LF form will be printed as one newline.
+       */
+      if (ch == '\r') {
+	fprintf(stderr, "\n");
+      } else if (ch == '\n') {
+	if (lastch != '\r')
+	  fprintf(stderr, "\n");
+      } else if (ch == '\\') {
+	fprintf(stderr, "\\\\");
+      } else if (isprint(ch)) {
+	putc(ch, stderr);
+      } else {
+	fprintf(stderr, "\\%03o", ch);
+      }
+      lastch = ch;
+    }
+  }
+
+  if (traceit)
+    fprintf(stderr, "\n");
+
+  return TRUE;
+}
+
+
+/*
+ * Check for overwrite of an existing file; clear it with user
+ */
+
+#ifndef NO_OVERWRITE_CHECK
+
+LOCAL(boolean)
+is_write_ok (char * outfname)
+{
+  FILE * ofile;
+  int ch;
+
+  ofile = fopen(outfname, READ_BINARY);
+  if (ofile == NULL)
+    return TRUE;		/* not present */
+  fclose(ofile);		/* oops, it is present */
+
+  for (;;) {
+    fprintf(stderr, "%s already exists, overwrite it? [y/n] ",
+	    outfname);
+    fflush(stderr);
+    ch = getc(stdin);
+    if (ch != '\n')		/* flush rest of line */
+      while (getc(stdin) != '\n')
+	/* nothing */;
+
+    switch (ch) {
+    case 'Y':
+    case 'y':
+      return TRUE;
+    case 'N':
+    case 'n':
+      return FALSE;
+    /* otherwise, ask again */
+    }
+  }
+}
+
+#endif
+
+
+/*
+ * Process a single input file name, and return its index in argv[].
+ * File names at or to left of old_file_index have been processed already.
+ */
+
+LOCAL(int)
+process_one_file (int argc, char **argv, int old_file_index)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  char *infilename;
+  char workfilename[PATH_MAX];
+  const char *default_extension = NULL;
+#ifdef PROGRESS_REPORT
+  struct cdjpeg_progress_mgr progress;
+#endif
+  int file_index;
+  djpeg_dest_ptr dest_mgr = NULL;
+  FILE * input_file = NULL;
+  FILE * output_file = NULL;
+  JDIMENSION num_scanlines;
+
+  /* Initialize the JPEG decompression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Insert custom marker processor for COM and APP12.
+   * APP12 is used by some digital camera makers for textual info,
+   * so we provide the ability to display it as text.
+   * If you like, additional APPn marker types can be selected for display,
+   * but don't try to override APP0 or APP14 this way (see libjpeg.doc).
+   */
+  jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
+  jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
+
+  /* Now safe to enable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) &cinfo);
+#endif
+
+  /* Scan command line to find next file name.
+   * It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   * (Exception: tracing level set here controls verbosity for COM markers
+   * found during jpeg_read_header...)
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, FALSE);
+  if (file_index >= argc) {
+    fprintf(stderr, "%s: missing input file name\n", progname);
+    usage();
+  }
+
+  /* Open the input file. */
+  infilename = argv[file_index];
+  if ((input_file = fopen(infilename, READ_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, infilename);
+    goto fail;
+  }
+
+#ifdef PROGRESS_REPORT
+  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+#endif
+
+  /* Specify data source for decompression */
+  jpeg_stdio_src(&cinfo, input_file);
+
+  /* Read file header, set default decompression parameters */
+  (void) jpeg_read_header(&cinfo, TRUE);
+
+  /* Adjust default decompression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, TRUE);
+
+  /* Initialize the output module now to let it override any crucial
+   * option settings (for instance, GIF wants to force color quantization).
+   */
+  switch (requested_fmt) {
+#ifdef BMP_SUPPORTED
+  case FMT_BMP:
+    dest_mgr = jinit_write_bmp(&cinfo, FALSE);
+    default_extension = ".bmp";
+    break;
+  case FMT_OS2:
+    dest_mgr = jinit_write_bmp(&cinfo, TRUE);
+    default_extension = ".bmp";
+    break;
+#endif
+#ifdef GIF_SUPPORTED
+  case FMT_GIF:
+    dest_mgr = jinit_write_gif(&cinfo);
+    default_extension = ".gif";
+    break;
+#endif
+#ifdef PPM_SUPPORTED
+  case FMT_PPM:
+    dest_mgr = jinit_write_ppm(&cinfo);
+    default_extension = ".ppm";
+    break;
+#endif
+#ifdef RLE_SUPPORTED
+  case FMT_RLE:
+    dest_mgr = jinit_write_rle(&cinfo);
+    default_extension = ".rle";
+    break;
+#endif
+#ifdef TARGA_SUPPORTED
+  case FMT_TARGA:
+    dest_mgr = jinit_write_targa(&cinfo);
+    default_extension = ".tga";
+    break;
+#endif
+  default:
+    ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
+    break;
+  }
+
+  /* If user didn't supply -outfile switch, select output file name. */
+  if (outfilename == NULL) {
+    int i;
+
+    outfilename = workfilename;
+    /* Make outfilename be infilename with appropriate extension */
+    strcpy(outfilename, infilename);
+    for (i = strlen(outfilename)-1; i >= 0; i--) {
+      switch (outfilename[i]) {
+      case ':':
+      case '/':
+      case '\\':
+	i = 0;			/* stop scanning */
+	break;
+      case '.':
+	outfilename[i] = '\0';	/* lop off existing extension */
+	i = 0;			/* stop scanning */
+	break;
+      default:
+	break;			/* keep scanning */
+      }
+    }
+    strcat(outfilename, default_extension);
+  }
+
+  fprintf(stderr, "Decompressing %s => %s\n", infilename, outfilename);
+#ifndef NO_OVERWRITE_CHECK
+  if (! is_write_ok(outfilename))
+    goto fail;
+#endif
+
+  /* Open the output file. */
+  if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't create %s\n", progname, outfilename);
+    goto fail;
+  }
+  dest_mgr->output_file = output_file;
+
+  /* Start decompressor */
+  (void) jpeg_start_decompress(&cinfo);
+
+  /* Write output file header */
+  (*dest_mgr->start_output) (&cinfo, dest_mgr);
+
+  /* Process data */
+  while (cinfo.output_scanline < cinfo.output_height) {
+    num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+					dest_mgr->buffer_height);
+    (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+  }
+
+#ifdef PROGRESS_REPORT
+  /* Hack: count final pass as done in case finish_output does an extra pass.
+   * The library won't have updated completed_passes.
+   */
+  progress.pub.completed_passes = progress.pub.total_passes;
+#endif
+
+  /* Finish decompression and release memory.
+   * I must do it in this order because output module has allocated memory
+   * of lifespan JPOOL_IMAGE; it needs to finish before releasing memory.
+   */
+  (*dest_mgr->finish_output) (&cinfo, dest_mgr);
+  (void) jpeg_finish_decompress(&cinfo);
+
+  /* Clean up and exit */
+fail:
+  jpeg_destroy_decompress(&cinfo);
+
+  if (input_file != NULL) fclose(input_file);
+  if (output_file != NULL) fclose(output_file);
+
+#ifdef PROGRESS_REPORT
+  end_progress_monitor((j_common_ptr) &cinfo);
+#endif
+
+  /* Disable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) NULL);
+#endif
+
+  return file_index;
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main (int argc, char **argv)
+{
+  int file_index;
+
+  /* On Mac, fetch a command line. */
+#ifdef USE_CCOMMAND
+  argc = ccommand(&argv);
+#endif
+
+#ifdef MSDOS
+  progname = "djpeg";		/* DOS tends to be too verbose about argv[0] */
+#else
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "djpeg";		/* in case C library doesn't provide it */
+#endif
+
+  /* The default maxmem must be computed only once at program startup,
+   * since releasing memory with free() won't give it back to the OS.
+   */
+#ifdef FREE_MEM_ESTIMATE
+  default_maxmem = FREE_MEM_ESTIMATE;
+#else
+  default_maxmem = 0;
+#endif
+
+  /* Scan command line, parse switches and locate input file names */
+
+  if (argc < 2)
+    usage();			/* nothing on the command line?? */
+
+  file_index = 0;
+
+  while (file_index < argc-1)
+    file_index = process_one_file(argc, argv, file_index);
+
+  /* All done. */
+  exit(EXIT_SUCCESS);
+  return 0;			/* suppress no-return-value warnings */
+}
--- a/altui/usage.alt
+++ b/altui/usage.alt
@@ -0,0 +1,62 @@
+(Most of the standard usage.doc file also applies to this alternate version,
+but replace its "GENERAL USAGE" section with the text below.  Edit the text
+as necessary if you don't support wildcards or overwrite checking.  Be sure
+to fix the djpeg switch descriptions if you are not defaulting to PPM output.
+Also, if you've provided an accurate memory-estimation procedure, you can
+probably eliminate the HINTS related to the -maxmemory switch.)
+
+
+GENERAL USAGE
+
+We provide two programs, cjpeg to compress an image file into JPEG format,
+and djpeg to decompress a JPEG file back into a conventional image format.
+
+The basic command line is:
+	cjpeg [switches] list of image files
+or
+	djpeg [switches] list of jpeg files
+
+Each file named is compressed or decompressed.  The input file(s) are not
+modified; the output data is written to files which have the same names
+except for extension.  cjpeg always uses ".jpg" for the output file name's
+extension; djpeg uses one of ".bmp", ".gif", ".ppm", ".rle", or ".tga",
+depending on what output format is selected by the switches.
+
+For example, to convert xxx.bmp to xxx.jpg and yyy.ppm to yyy.jpg, say:
+	cjpeg xxx.bmp yyy.ppm
+
+On most systems you can use standard wildcards to specify the list of input
+files; for example, on DOS "djpeg *.jpg" decompresses all the JPEG files in
+the current directory.
+
+If an intended output file already exists, you'll be asked whether or not to
+overwrite it.  If you say no, the program skips that input file and goes on
+to the next one.
+
+You can intermix switches and file names; for example
+	djpeg -gif file1.jpg -targa file2.jpg
+decompresses file1.jpg into GIF format (file1.gif) and file2.jpg into Targa
+format (file2.tga).  Only switches to the left of a given file name affect
+processing of that file; when there are conflicting switches, the rightmost
+one takes precedence.
+
+You can override the program's choice of output file name by using the
+-outfile switch, as in
+	cjpeg -outfile output.jpg input.ppm
+-outfile only affects the first input file name to its right.
+
+The currently supported image file formats are: PPM (PBMPLUS color format),
+PGM (PBMPLUS gray-scale format), BMP, GIF, Targa, and RLE (Utah Raster
+Toolkit format).  (RLE is supported only if the URT library is available,
+which it isn't on most non-Unix systems.)  cjpeg recognizes the input image
+format automatically, with the exception of some Targa-format files.  You
+have to tell djpeg which format to generate.
+
+JPEG files are in the defacto standard JFIF file format.  There are other,
+less widely used JPEG-based file formats, but we don't support them.
+
+All switch names may be abbreviated; for example, -grayscale may be written
+-gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
+one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
+British spellings are also accepted (e.g., -greyscale), though for brevity
+these are not mentioned below.
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : August 23, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains a command-line user interface for the JPEG compressor.
 * It should work on any system with Unix- or MS-DOS-style command lines.
 *
@@ -195,6 +202,22 @@ usage (void)
 }


+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, char * labelstr, unsigned int simd)
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr,
+	  simd & JSIMD_MMX   ? " MMX"    : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
 LOCAL(int)
 parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 		int last_file_arg_seen, boolean for_real)
@@ -258,6 +281,19 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
      /* Force baseline-compatible output (8-bit quantizer values). */
      force_baseline = TRUE;

+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
    } else if (keymatch(arg, "dct", 2)) {
      /* Select DCT algorithm. */
      if (++argn >= argc)	/* advance to next argument */
@@ -279,6 +315,32 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv,
      if (! printed_version) {
 	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
 		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_forward_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_forward_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_forward_dct(cinfo, JDCT_FLOAT));
+#endif
+	print_simd_info(stderr, "Downsampling (-sample 2x2 or 2x1)  :",
+			jpeg_simd_downsampler(cinfo));
+	print_simd_info(stderr, "Colorspace conversion (RGB->YCbCr) :",
+			jpeg_simd_color_converter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 	printed_version = TRUE;
      }
      cinfo->err->trace_level++;
--- a/ckconfig.c
+++ b/ckconfig.c
@@ -4,6 +4,13 @@
 * Copyright (C) 1991-1994, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 28, 2005
+ * ---------------------------------------------------------------------
 */

 /*
@@ -361,6 +368,10 @@ int main (argc, argv)
  fprintf(outfile, "#define INCOMPLETE_TYPES_BROKEN\n");
 #else
  fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
+#endif
+#ifdef _WIN32
+  fprintf(outfile, "\n/* Define "boolean" as unsigned char, not int, per Windows custom */\n");
+  fprintf(outfile, "#define TYPEDEF_UCHAR_BOOLEAN\n");
 #endif
  fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
  if (is_shifting_signed(-0x7F7E80B1L))
@@ -368,6 +379,14 @@ int main (argc, argv)
  else
    fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
  fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
+
+  fprintf(outfile, "\n#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)\n");
+  fprintf(outfile, "#undef JSIMD_MMX_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_3DNOW_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_SSE_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_SSE2_NOT_SUPPORTED\n");
+  fprintf(outfile, "#endif\n");
+
  fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
  fprintf(outfile, "#define BMP_SUPPORTED		/* BMP image file format */\n");
  fprintf(outfile, "#define GIF_SUPPORTED		/* GIF image file format */\n");
@@ -375,6 +394,9 @@ int main (argc, argv)
  fprintf(outfile, "#undef RLE_SUPPORTED		/* Utah RLE image file format */\n");
  fprintf(outfile, "#define TARGA_SUPPORTED		/* Targa image file format */\n\n");
  fprintf(outfile, "#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */\n");
+#ifdef _WIN32
+  fprintf(outfile, "#define USE_SETMODE		/* Needed to make one-file style work */\n");
+#endif
  fprintf(outfile, "#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */\n");
  fprintf(outfile, "#undef DONT_USE_B_MODE\n");
  fprintf(outfile, "/* #define PROGRESS_REPORT */	/* optional */\n");
--- a/config.guess
+++ b/config.guess
--- a/config.sub
+++ b/config.sub
--- a/config.ver
+++ b/config.ver
@@ -0,0 +1,44 @@
+
+JPEG_VER_MAJOR=62
+JPEG_VER_MINOR=1
+JPEG_REVISION=0
+
+case $host_os in
+  cygwin*)
+    # The shared library built from this source code is *not* binary
+    # compatible with the cygwin's official binary release (cygjpeg-62.dll).
+    # This is because the official binary has been built with
+    # the lossless jpeg patch which is available as ljpeg-6b.tar.gz .
+    # Therefore we decided to give the shared library the version number
+    # other than 62.
+    #
+    JPEG_VER_MAJOR=162
+    JPEG_VER_MINOR=0
+    ;;
+  freebsd*)
+    # This follows the official binary release in the ports collection.
+    JPEG_VER_MAJOR=9
+    ;;
+esac
+
+# convert absolute version numbers to libtool ages
+case $version_type in
+  freebsd-aout|freebsd-elf|sunos)
+    JPEG_LT_CURRENT=$JPEG_VER_MAJOR
+    JPEG_LT_REVISION=$JPEG_VER_MINOR
+    JPEG_LT_AGE=0
+    ;;
+  irix|nonstopux)
+    JPEG_LT_CURRENT=`expr $JPEG_VER_MAJOR + $JPEG_VER_MINOR - 1`
+    JPEG_LT_AGE=$JPEG_VER_MINOR
+    JPEG_LT_REVISION=$JPEG_VER_MINOR
+    ;;
+  *)
+    JPEG_LT_CURRENT=`expr $JPEG_VER_MAJOR + $JPEG_VER_MINOR`
+    JPEG_LT_AGE=$JPEG_VER_MINOR
+    JPEG_LT_REVISION=$JPEG_REVISION
+    ;;
+esac
+
+JPEG_LIB_VERSION=$JPEG_LT_CURRENT:$JPEG_LT_REVISION:$JPEG_LT_AGE
+
--- a/5363
+++ b/5363
--- a/configure.in
+++ b/configure.in
@@ -0,0 +1,634 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT([jcmaster.c])
+AC_CONFIG_HEADER([jconfig.h:jconfig.cfg])
+dnl --------------------------------------------------------------------
+AC_PROG_CC
+AC_PROG_CPP
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for function prototypes])
+AC_CACHE_VAL([ijg_cv_have_prototypes],[AC_TRY_COMPILE([
+int testfunction (int arg1, int * arg2); /* check prototypes */
+struct methods_struct {		/* check method-pointer declarations */
+  int (*error_exit) (char *msgtext);
+  int (*trace_message) (char *msgtext);
+  int (*another_method) (void);
+};
+int testfunction (int arg1, int * arg2) /* check definitions */
+{ return arg2[arg1]; }
+int test2function (void)	/* check void arg list */
+{ return 0; }
+],[ ],[ijg_cv_have_prototypes=yes],[ijg_cv_have_prototypes=no])])
+AC_MSG_RESULT([$ijg_cv_have_prototypes])
+if test $ijg_cv_have_prototypes = yes; then
+  AC_DEFINE([HAVE_PROTOTYPES],)
+else
+  echo [Your compiler does not seem to know about function prototypes.]
+  echo [Perhaps it needs a special switch to enable ANSI C mode.]
+  echo [If so, we recommend running configure like this:]
+  echo ["   ./configure  CC='cc -switch'"]
+  echo [where -switch is the proper switch.]
+fi
+dnl --------------------------------------------------------------------
+AC_CHECK_HEADER([stddef.h],[AC_DEFINE([HAVE_STDDEF_H],)])
+AC_CHECK_HEADER([stdlib.h],[AC_DEFINE([HAVE_STDLIB_H],)])
+AC_CHECK_HEADER([string.h],[:],[AC_DEFINE([NEED_BSD_STRINGS],)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for size_t])
+AC_TRY_COMPILE([
+#ifdef HAVE_STDDEF_H
+#include <stddef.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#include <stdio.h>
+#ifdef NEED_BSD_STRINGS
+#include <strings.h>
+#else
+#include <string.h>
+#endif
+typedef size_t my_size_t;
+],[ my_size_t foovar; ],
+[ijg_size_t_ok=yes],
+[ijg_size_t_ok="not ANSI, perhaps it is in sys/types.h"])
+AC_MSG_RESULT([$ijg_size_t_ok])
+if test "$ijg_size_t_ok" != yes; then
+AC_CHECK_HEADER([sys/types.h],[AC_DEFINE([NEED_SYS_TYPES_H],)
+AC_EGREP_HEADER([size_t],[sys/types.h],
+[ijg_size_t_ok="size_t is in sys/types.h"],[ijg_size_t_ok=no])],
+[ijg_size_t_ok=no])
+AC_MSG_RESULT([$ijg_size_t_ok])
+if test "$ijg_size_t_ok" = no; then
+  echo [Type size_t is not defined in any of the usual places.]
+  echo [Try putting '"typedef unsigned int size_t;"' in jconfig.h.]
+fi
+fi
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type unsigned char])
+AC_TRY_COMPILE(,[ unsigned char un_char; ],[AC_MSG_RESULT(yes)
+AC_DEFINE([HAVE_UNSIGNED_CHAR],)],[AC_MSG_RESULT(no)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type unsigned short])
+AC_TRY_COMPILE(,[ unsigned short un_short; ],[AC_MSG_RESULT(yes)
+AC_DEFINE([HAVE_UNSIGNED_SHORT],)],[AC_MSG_RESULT(no)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type void])
+AC_TRY_COMPILE([
+/* Caution: a C++ compiler will insist on valid prototypes */
+typedef void * void_ptr;	/* check void * */
+#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
+typedef void (*void_func) (int a, int b);
+#else
+typedef void (*void_func) ();
+#endif
+
+#ifdef HAVE_PROTOTYPES		/* check void function result */
+void test3function (void_ptr arg1, void_func arg2)
+#else
+void test3function (arg1, arg2)
+     void_ptr arg1;
+     void_func arg2;
+#endif
+{
+  char * locptr = (char *) arg1; /* check casting to and from void * */
+  arg1 = (void *) locptr;
+  (*arg2) (1, 2);		/* check call of fcn returning void */
+}
+],[ ],[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([void],[char])])
+
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for working const])
+AC_CACHE_VAL([ac_cv_c_const],[AC_TRY_COMPILE(,[
+/* Ultrix mips cc rejects this.  */
+typedef int charset[2]; const charset x;
+/* SunOS 4.1.1 cc rejects this.  */
+char const *const *ccp;
+char **p;
+/* NEC SVR4.0.2 mips cc rejects this.  */
+struct point {int x, y;};
+static struct point const zero = {0,0};
+/* AIX XL C 1.02.0.0 rejects this.
+   It does not let you subtract one const X* pointer from another in an arm
+   of an if-expression whose if-part is not a constant expression */
+const char *g = "string";
+ccp = &g + (g ? g-g : 0);
+/* HPUX 7.0 cc rejects these. */
++ccp;
+p = (char**) ccp;
+ccp = (char const *const *) p;
+{ /* SCO 3.2v4 cc rejects this.  */
+  char *t;
+  char const *s = 0 ? (char *) 0 : (char const *) 0;
+
+  *t++ = 0;
+}
+{ /* Someone thinks the Sun supposedly-ANSI compiler will reject this.  */
+  int x[] = {25, 17};
+  const int *foo = &x[0];
+  ++foo;
+}
+{ /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */
+  typedef const int *iptr;
+  iptr p = 0;
+  ++p;
+}
+{ /* AIX XL C 1.02.0.0 rejects this saying
+     "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
+  struct s { int j; const int *ap[3]; };
+  struct s *b; b->j = 5;
+}
+{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
+  const int foo = 10;
+}
+],[ac_cv_c_const=yes],[ac_cv_c_const=no])])
+AC_MSG_RESULT([$ac_cv_c_const])
+if test $ac_cv_c_const = no; then
+  AC_DEFINE([const],)
+fi
+
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for inline])
+ijg_cv_inline=""
+AC_TRY_COMPILE(,[} __inline__ int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="__inline__"],
+[AC_TRY_COMPILE(,[} __inline int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="__inline"],
+[AC_TRY_COMPILE(,[} inline int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="inline"],)])])
+AC_MSG_RESULT([$ijg_cv_inline])
+AC_DEFINE_UNQUOTED([INLINE],[$ijg_cv_inline])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for broken incomplete types])
+AC_TRY_COMPILE([ typedef struct undefined_structure * undef_struct_ptr; ],
+,[AC_MSG_RESULT(ok)],[AC_MSG_RESULT(broken)
+AC_DEFINE([INCOMPLETE_TYPES_BROKEN],)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for short external names])
+AC_TRY_LINK([
+int possibly_duplicate_function () { return 0; }
+int possibly_dupli_function () { return 1; }
+],[ ],[AC_MSG_RESULT(ok)],[AC_MSG_RESULT(short)
+AC_DEFINE([NEED_SHORT_EXTERNAL_NAMES],)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([to see if char is signed])
+AC_TRY_RUN([
+#ifdef HAVE_PROTOTYPES
+int is_char_signed (int arg)
+#else
+int is_char_signed (arg)
+     int arg;
+#endif
+{
+  if (arg == 189) {		/* expected result for unsigned char */
+    return 0;			/* type char is unsigned */
+  }
+  else if (arg != -67) {	/* expected result for signed char */
+    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
+    printf("I fear the JPEG software will not work at all.\n\n");
+  }
+  return 1;			/* assume char is signed otherwise */
+}
+char signed_char_check = (char) (-67);
+main() {
+  exit(is_char_signed((int) signed_char_check));
+}],[AC_MSG_RESULT(no)
+AC_DEFINE([CHAR_IS_UNSIGNED],)],[AC_MSG_RESULT(yes)],
+[echo Assuming that char is signed on target machine.
+echo If it is unsigned, this will be a little bit inefficient.
+])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([to see if right shift is signed])
+AC_TRY_RUN([
+#ifdef HAVE_PROTOTYPES
+int is_shifting_signed (long arg)
+#else
+int is_shifting_signed (arg)
+     long arg;
+#endif
+/* See whether right-shift on a long is signed or not. */
+{
+  long res = arg >> 4;
+
+  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
+    return 1;			/* right shift is signed */
+  }
+  /* see if unsigned-shift hack will fix it. */
+  /* we can't just test exact value since it depends on width of long... */
+  res |= (~0L) << (32-4);
+  if (res == -0x7F7E80CL) {	/* expected result now? */
+    return 0;			/* right shift is unsigned */
+  }
+  printf("Right shift isn't acting as I expect it to.\n");
+  printf("I fear the JPEG software will not work at all.\n\n");
+  return 0;			/* try it with unsigned anyway */
+}
+main() {
+  exit(is_shifting_signed(-0x7F7E80B1L));
+}],[AC_MSG_RESULT(no)
+AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED],)],[AC_MSG_RESULT(yes)],
+[AC_MSG_RESULT([Assuming that right shift is signed on target machine.])])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([to see if fopen accepts b spec])
+AC_TRY_RUN([
+#include <stdio.h>
+main() {
+  if (fopen("conftestdata", "wb") != NULL)
+    exit(0);
+  exit(1);
+}],[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([DONT_USE_B_MODE],)],[AC_MSG_RESULT([Assuming that it does.])])
+dnl --------------------------------------------------------------------
+AC_PROG_INSTALL
+AC_PROG_RANLIB
+dnl --------------------------------------------------------------------
+
+AC_CANONICAL_HOST
+AC_EXEEXT
+
+# Decide whether to use libtool,
+# and if so whether to build shared, static, or both flavors of library.
+AC_DISABLE_SHARED
+AC_DISABLE_STATIC
+if test "x$enable_shared" != xno  -o  "x$enable_static" != xno; then
+  USELIBTOOL="yes"
+# LIBTOOL="./libtool"
+  O="lo"
+  A="la"
+  LN='$(LIBTOOL) --mode=link $(CC)'
+  INSTALL_LIB='$(LIBTOOL) --mode=install ${INSTALL}'
+  INSTALL_PROGRAM="\$(LIBTOOL) --mode=install $INSTALL_PROGRAM"
+  UNINSTALL='$(LIBTOOL) --mode=uninstall $(RM)'
+else
+  USELIBTOOL="no"
+  LIBTOOL=""
+  O="o"
+  A="a"
+  LN='$(CC)'
+  INSTALL_LIB="$INSTALL_DATA"
+  UNINSTALL='$(RM)'
+fi
+AC_SUBST([LIBTOOL])
+AC_SUBST([O])
+AC_SUBST([A])
+AC_SUBST([LN])
+AC_SUBST([INSTALL_LIB])
+AC_SUBST([UNINSTALL])
+
+# Configure libtool if needed.
+if test $USELIBTOOL = yes; then
+  AC_LIBTOOL_DLOPEN
+  AC_LIBTOOL_WIN32_DLL
+  AC_PROG_LIBTOOL
+fi
+# if libtool >= 1.5
+TAGCC=ifdef([AC_LIBTOOL_GCJ],[--tag=CC])
+AC_SUBST([TAGCC])
+
+dnl --------------------------------------------------------------------
+# Select memory manager depending on user input.
+# If no "-enable-maxmem", use jmemnobs
+MEMORYMGR='jmemnobs.$(O)'
+MAXMEM="no"
+AC_ARG_ENABLE([maxmem],
+[  --enable-maxmem[=N]     enable use of temp files, set max mem usage to N MB],
+[MAXMEM="$enableval"])
+# support --with-maxmem for backwards compatibility with IJG V5.
+AC_ARG_WITH([maxmem],,[MAXMEM="$withval"])
+if test "x$MAXMEM" = xyes; then
+  MAXMEM=1
+fi
+if test "x$MAXMEM" != xno; then
+  if test -n "`echo $MAXMEM | sed 's/[[0-9]]//g'`"; then
+    AC_MSG_ERROR([non-numeric argument to --enable-maxmem])
+  fi
+  DEFAULTMAXMEM=`expr $MAXMEM \* 1048576`
+AC_DEFINE_UNQUOTED([DEFAULT_MAX_MEM],[${DEFAULTMAXMEM}])
+AC_MSG_CHECKING([for 'tmpfile()'])
+AC_TRY_LINK([#include <stdio.h>],[ FILE * tfile = tmpfile(); ],
+[AC_MSG_RESULT(yes)
+MEMORYMGR='jmemansi.$(O)'],
+[AC_MSG_RESULT(no)
+MEMORYMGR='jmemname.$(O)'
+AC_DEFINE([NEED_SIGNAL_CATCHER],)
+AC_MSG_CHECKING([for 'mktemp()'])
+AC_TRY_LINK(,[ char fname[80]; mktemp(fname); ],
+[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([NO_MKTEMP],)])])
+fi
+AC_SUBST([MEMORYMGR])
+
+dnl ====================================================================
+
+AC_MSG_CHECKING([to see if the host cpu type is i386 or compatible])
+case "$host_cpu" in
+  i*86 | x86 | ia32)
+    AC_MSG_RESULT(yes)
+  ;;
+  x86_64 | amd64 | aa64)
+    AC_MSG_RESULT([no (x86_64)])
+    AC_MSG_ERROR([Currently, this version of JPEG library cannot be compiled as 64-bit code. sorry.])
+  ;;
+  *)
+    AC_MSG_RESULT([no ("$host_cpu")])
+    AC_MSG_ERROR([This version of JPEG library is for i386 or compatible processors only.])
+  ;;
+esac
+
+if test -z "$NAFLAGS" ; then
+  AC_MSG_CHECKING([for object file format of host system])
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      objfmt='Win32-COFF'
+    ;;
+    msdosdjgpp* | go32*)
+      objfmt='COFF'
+    ;;
+    os2-emx*)			# not tested
+      objfmt='MSOMF'		# obj
+    ;;
+    linux*coff* | linux*oldld*)
+      objfmt='COFF'		# ???
+    ;;
+    linux*aout*)
+      objfmt='a.out'
+    ;;
+    linux*)
+      objfmt='ELF'
+    ;;
+    freebsd* | netbsd* | openbsd*)
+      if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+        objfmt='BSD-a.out'
+      else
+        objfmt='ELF'
+      fi
+    ;;
+    solaris* | sunos* | sysv* | sco*)
+      objfmt='ELF'
+    ;;
+    darwin* | rhapsody* | nextstep* | openstep* | macos*)
+      objfmt='Mach-O'
+    ;;
+    *)
+      objfmt='ELF ?'
+    ;;
+  esac
+  AC_MSG_RESULT([$objfmt])
+  if test "$objfmt" = 'ELF ?'; then
+    objfmt='ELF'
+    AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+  fi
+else
+  objfmt=''
+fi
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+dnl --------------------------------------------------------------------
+
+AC_CHECK_PROGS(NASM, [nasm nasmw])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found in \$PATH])
+if echo "$NASM" | grep yasm > /dev/null; then
+  AC_MSG_WARN([DON'T USE YASM! CURRENT VERSION (R0.4.0) IS BUGGY!])
+fi
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        bits    32
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+AC_MSG_CHECKING([whether the assembler supports line continuation character])
+cat > conftest.asm <<\EOF
+[%line __oline__ "configure"
+; The line continuation character '\'
+; was introduced in nasm 0.98.25.
+        section .text
+        bits    32
+        global  _zero
+_zero:  xor     \
+                eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([you have to use a more recent version of the assembler.])
+fi
+
+dnl --------------------------------------------------------------------
+
+AC_MSG_CHECKING([SIMD instruction sets requested to use])
+simd_to_use=""
+
+AC_ARG_ENABLE(mmx,
+[  --disable-mmx           do not use MMX instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_MMX_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use MMX"
+fi], [simd_to_use="$simd_to_use MMX"])
+
+AC_ARG_ENABLE(3dnow,
+[  --disable-3dnow         do not use 3DNow! instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_3DNOW_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use 3DNow!"
+fi], [simd_to_use="$simd_to_use 3DNow!"])
+
+AC_ARG_ENABLE(sse,
+[  --disable-sse           do not use SSE instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_SSE_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use SSE"
+fi], [simd_to_use="$simd_to_use SSE"])
+
+AC_ARG_ENABLE(sse2,
+[  --disable-sse2          do not use SSE2 instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_SSE2_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use SSE2"
+fi], [simd_to_use="$simd_to_use SSE2"])
+
+test -z "$simd_to_use" && simd_to_use="NONE"
+AC_MSG_RESULT([$simd_to_use])
+
+for simd_name in $simd_to_use; do
+case "$simd_name" in
+  MMX)    simd_instruction='psubw mm0,mm0';;
+  3DNow!) simd_instruction='pfsub mm0,mm0';;
+  SSE)    simd_instruction='subps xmm0,xmm0';;
+  SSE2)   simd_instruction='subpd xmm0,xmm0';;
+  *)      continue;;
+esac
+AC_MSG_CHECKING([whether the assembler supports $simd_name instructions])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        bits    32
+        global  _simd
+_simd:  $simd_instruction
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([you have to use a more recent version of the assembler.])
+fi
+done
+
+dnl --------------------------------------------------------------------
+# Select OS-dependent SIMD instruction support checker.
+# jsimdw32.$(O) (Win32) / jsimddjg.$(O) (DJGPP V.2) / jsimdgcc.$(O) (Unix/gcc)
+if test "x$SIMDCHECKER" = x ; then
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      SIMDCHECKER='jsimdw32.$(O)'
+    ;;
+    msdosdjgpp* | go32*)
+      SIMDCHECKER='jsimddjg.$(O)'
+    ;;
+    os2-emx*)			# not tested
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+    *)
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+  esac
+fi
+AC_SUBST([SIMDCHECKER])
+
+case "$host_os" in
+  cygwin* | mingw* | pw32* | os2-emx* | msdosdjgpp* | go32*)
+    AC_DEFINE([USE_SETMODE],)
+  ;;
+# _host_name_*)
+#   AC_DEFINE([USE_FDOPEN],)
+# ;;
+esac
+
+# This is for UNIX-like environments on Windows platform.
+AC_ARG_ENABLE(uchar-boolean,
+[  --enable-uchar-boolean  define type \"boolean\" as unsigned char (for Windows)],
+[if test "x$enableval" != xno; then
+  AC_DEFINE([TYPEDEF_UCHAR_BOOLEAN],)
+fi])
+
+dnl --------------------------------------------------------------------
+
+JPEG_LIB_VERSION="63:0:1"
+confv_dirs="$srcdir $srcdir/.. $srcdir/../.."
+config_ver=
+for ac_dir in $confv_dirs; do
+  if test -r $ac_dir/config.ver; then
+    config_ver=$ac_dir/config.ver
+    break
+  fi
+done
+if test -z "$config_ver"; then
+  AC_MSG_WARN([cannot find config.ver in $confv_dirs])
+  AC_MSG_WARN([default version number $JPEG_LIB_VERSION is used])
+  AC_MSG_CHECKING([libjpeg version number for libtool])
+  AC_MSG_RESULT([$JPEG_LIB_VERSION])
+else
+  AC_MSG_CHECKING([libjpeg version number for libtool])
+  . $config_ver
+  AC_MSG_RESULT([$JPEG_LIB_VERSION])
+  echo "configure: if you want to change the version number, modify $config_ver" 1>&2
+fi
+AC_SUBST([JPEG_LIB_VERSION])
+
+dnl --------------------------------------------------------------------
+# Prepare to massage makefile.cfg correctly.
+if test $ijg_cv_have_prototypes = yes; then
+  A2K_DEPS=""
+  COM_A2K="# "
+else
+  A2K_DEPS="ansi2knr"
+  COM_A2K=""
+fi
+AC_SUBST([A2K_DEPS])
+AC_SUBST([COM_A2K])
+# ansi2knr needs -DBSD if string.h is missing
+if test $ac_cv_header_string_h = no; then
+  ANSI2KNRFLAGS="-DBSD"
+else
+  ANSI2KNRFLAGS=""
+fi
+AC_SUBST([ANSI2KNRFLAGS])
+# Substitutions to enable or disable libtool-related stuff
+if test $USELIBTOOL = yes -a $ijg_cv_have_prototypes = yes; then
+  COM_LT=""
+else
+  COM_LT="# "
+fi
+AC_SUBST([COM_LT])
+if test "x$enable_shared" != xno; then
+  FORCE_INSTALL_LIB="install-lib"
+  UNINSTALL_LIB="uninstall-lib"
+else
+  FORCE_INSTALL_LIB=""
+  UNINSTALL_LIB=""
+fi
+AC_SUBST([FORCE_INSTALL_LIB])
+AC_SUBST([UNINSTALL_LIB])
+# Set up -I directives
+if test "x$srcdir" = x.; then
+  INCLUDEFLAGS='-I$(srcdir)'
+else
+  INCLUDEFLAGS='-I. -I$(srcdir)'
+fi
+AC_SUBST([INCLUDEFLAGS])
+dnl --------------------------------------------------------------------
+AC_OUTPUT([Makefile:makefile.cfg])
--- a/djpeg.c
+++ b/djpeg.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : August 23, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains a command-line user interface for the JPEG decompressor.
 * It should work on any system with Unix- or MS-DOS-style command lines.
 *
@@ -158,6 +165,22 @@ usage (void)
 }


+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, char * labelstr, unsigned int simd)
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr,
+	  simd & JSIMD_MMX   ? " MMX"    : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
 LOCAL(int)
 parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
 		int last_file_arg_seen, boolean for_real)
@@ -208,6 +231,19 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
      cinfo->desired_number_of_colors = val;
      cinfo->quantize_colors = TRUE;

+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
    } else if (keymatch(arg, "dct", 2)) {
      /* Select IDCT algorithm. */
      if (++argn >= argc)	/* advance to next argument */
@@ -242,6 +278,38 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
      if (! printed_version) {
 	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
 		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT));
+#endif
+#ifdef IDCT_SCALING_SUPPORTED
+	print_simd_info(stderr, "Reduced-size DCT      (-scale M/N) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT+1));
+#endif
+	print_simd_info(stderr, "High-quality upsampling (default)  :",
+			jpeg_simd_upsampler(cinfo, TRUE));
+	print_simd_info(stderr, "Low-quality upsampling (-nosmooth) :",
+			jpeg_simd_upsampler(cinfo, FALSE));
+	print_simd_info(stderr, "Colorspace conversion (YCbCr->RGB) :",
+			jpeg_simd_color_deconverter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 	printed_version = TRUE;
      }
      cinfo->err->trace_level++;
--- a/437
+++ b/437
@@ -1,19 +1,38 @@
 #!/bin/sh
-#
 # install - install a program, script, or datafile
-# This comes from X11R5 (mit/util/scripts/install.sh).
+
+scriptversion=2005-05-14.22
+
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
 #
-# Copyright 1991 by the Massachusetts Institute of Technology
+# Copyright (C) 1994 X Consortium
 #
-# Permission to use, copy, modify, distribute, and sell this software and its
-# documentation for any purpose is hereby granted without fee, provided that
-# the above copyright notice appear in all copies and that both that
-# copyright notice and this permission notice appear in supporting
-# documentation, and that the name of M.I.T. not be used in advertising or
-# publicity pertaining to distribution of the software without specific,
-# written prior permission.  M.I.T. makes no representations about the
-# suitability of this software for any purpose.  It is provided "as is"
-# without express or implied warranty.
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
 #
 # Calling this script install-sh is preferred over install.sh, to prevent
 # `make' implicit rules from creating a file called install from it
@@ -23,13 +42,11 @@
 # from scratch.  It can only install one file at a time, a restriction
 # shared with many OS's install programs.

-
 # set DOITPROG to echo to test this script

 # Don't use :- since 4.3BSD and earlier shells don't like it.
 doit="${DOITPROG-}"

-
 # put in absolute paths if you don't have them in your path; or use env. vars.

 mvprog="${MVPROG-mv}"
@@ -41,210 +58,266 @@ stripprog="${STRIPPROG-strip}"
 rmprog="${RMPROG-rm}"
 mkdirprog="${MKDIRPROG-mkdir}"

-transformbasename=""
-transform_arg=""
-instcmd="$mvprog"
 chmodcmd="$chmodprog 0755"
-chowncmd=""
-chgrpcmd=""
-stripcmd=""
+chowncmd=
+chgrpcmd=
+stripcmd=
 rmcmd="$rmprog -f"
 mvcmd="$mvprog"
-src=""
-dst=""
-dir_arg=""
+src=
+dst=
+dir_arg=
+dstarg=
+no_target_directory=

-while [ x"$1" != x ]; do
-    case $1 in
-	-c) instcmd="$cpprog"
-	    shift
-	    continue;;
+usage="Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+   or: $0 [OPTION]... SRCFILES... DIRECTORY
+   or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+   or: $0 [OPTION]... -d DIRECTORIES...

-	-d) dir_arg=true
-	    shift
-	    continue;;
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.

-	-m) chmodcmd="$chmodprog $2"
-	    shift
-	    shift
-	    continue;;
+Options:
+-c         (ignored)
+-d         create directories instead of installing files.
+-g GROUP   $chgrpprog installed files to GROUP.
+-m MODE    $chmodprog installed files to MODE.
+-o USER    $chownprog installed files to USER.
+-s         $stripprog installed files.
+-t DIRECTORY  install into DIRECTORY.
+-T         report an error if DSTFILE is a directory.
+--help     display this help and exit.
+--version  display version info and exit.

-	-o) chowncmd="$chownprog $2"
-	    shift
-	    shift
-	    continue;;
+Environment variables override the default commands:
+  CHGRPPROG CHMODPROG CHOWNPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG
+"

-	-g) chgrpcmd="$chgrpprog $2"
-	    shift
-	    shift
-	    continue;;
+while test -n "$1"; do
+  case $1 in
+    -c) shift
+        continue;;

-	-s) stripcmd="$stripprog"
-	    shift
-	    continue;;
+    -d) dir_arg=true
+        shift
+        continue;;

-	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
-	    shift
-	    continue;;
+    -g) chgrpcmd="$chgrpprog $2"
+        shift
+        shift
+        continue;;

-	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
-	    shift
-	    continue;;
+    --help) echo "$usage"; exit $?;;

-	*)  if [ x"$src" = x ]
-	    then
-		src=$1
-	    else
-		# this colon is to work around a 386BSD /bin/sh bug
-		:
-		dst=$1
-	    fi
-	    shift
-	    continue;;
-    esac
-done
+    -m) chmodcmd="$chmodprog $2"
+        shift
+        shift
+        continue;;

-if [ x"$src" = x ]
-then
-	echo "install:	no input file specified"
-	exit 1
-else
-	true
-fi
+    -o) chowncmd="$chownprog $2"
+        shift
+        shift
+        continue;;

-if [ x"$dir_arg" != x ]; then
-	dst=$src
-	src=""
+    -s) stripcmd=$stripprog
+        shift
+        continue;;

-	if [ -d $dst ]; then
-		instcmd=:
-	else
-		instcmd=mkdir
-	fi
-else
-
-# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
-# might cause directories to be created, which would be especially bad 
-# if $src (and thus $dsttmp) contains '*'.
-
-	if [ -f $src -o -d $src ]
-	then
-		true
-	else
-		echo "install:  $src does not exist"
-		exit 1
-	fi
-	
-	if [ x"$dst" = x ]
-	then
-		echo "install:	no destination specified"
-		exit 1
-	else
-		true
-	fi
-
-# If destination is a directory, append the input filename; if your system
-# does not like double slashes in filenames, you may need to add some logic
-
-	if [ -d $dst ]
-	then
-		dst="$dst"/`basename $src`
-	else
-		true
-	fi
-fi
-
-## this sed command emulates the dirname command
-dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
-
-# Make sure that the destination directory exists.
-#  this part is taken from Noah Friedman's mkinstalldirs script
-
-# Skip lots of stat calls in the usual case.
-if [ ! -d "$dstdir" ]; then
-defaultIFS='	
-'
-IFS="${IFS-${defaultIFS}}"
-
-oIFS="${IFS}"
-# Some sh's can't handle IFS=/ for some reason.
-IFS='%'
-set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
-IFS="${oIFS}"
-
-pathcomp=''
-
-while [ $# -ne 0 ] ; do
-	pathcomp="${pathcomp}${1}"
+    -t) dstarg=$2
 	shift
+	shift
+	continue;;

-	if [ ! -d "${pathcomp}" ] ;
-        then
-		$mkdirprog "${pathcomp}"
-	else
-		true
-	fi
+    -T) no_target_directory=true
+	shift
+	continue;;

-	pathcomp="${pathcomp}/"
+    --version) echo "$0 $scriptversion"; exit $?;;
+
+    *)  # When -d is used, all remaining arguments are directories to create.
+	# When -t is used, the destination is already specified.
+	test -n "$dir_arg$dstarg" && break
+        # Otherwise, the last argument is the destination.  Remove it from $@.
+	for arg
+	do
+          if test -n "$dstarg"; then
+	    # $@ is not empty: it contains at least $arg.
+	    set fnord "$@" "$dstarg"
+	    shift # fnord
+	  fi
+	  shift # arg
+	  dstarg=$arg
+	done
+	break;;
+  esac
 done
+
+if test -z "$1"; then
+  if test -z "$dir_arg"; then
+    echo "$0: no input file specified." >&2
+    exit 1
+  fi
+  # It's OK to call `install-sh -d' without argument.
+  # This can happen when creating conditional directories.
+  exit 0
 fi

-if [ x"$dir_arg" != x ]
-then
-	$doit $instcmd $dst &&
+for src
+do
+  # Protect names starting with `-'.
+  case $src in
+    -*) src=./$src ;;
+  esac

-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
-else
+  if test -n "$dir_arg"; then
+    dst=$src
+    src=

-# If we're going to rename the final executable, determine the name now.
+    if test -d "$dst"; then
+      mkdircmd=:
+      chmodcmd=
+    else
+      mkdircmd=$mkdirprog
+    fi
+  else
+    # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+    # might cause directories to be created, which would be especially bad
+    # if $src (and thus $dsttmp) contains '*'.
+    if test ! -f "$src" && test ! -d "$src"; then
+      echo "$0: $src does not exist." >&2
+      exit 1
+    fi

-	if [ x"$transformarg" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		dstfile=`basename $dst $transformbasename | 
-			sed $transformarg`$transformbasename
-	fi
+    if test -z "$dstarg"; then
+      echo "$0: no destination specified." >&2
+      exit 1
+    fi

-# don't allow the sed command to completely eliminate the filename
+    dst=$dstarg
+    # Protect names starting with `-'.
+    case $dst in
+      -*) dst=./$dst ;;
+    esac

-	if [ x"$dstfile" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		true
-	fi
+    # If destination is a directory, append the input filename; won't work
+    # if double slashes aren't ignored.
+    if test -d "$dst"; then
+      if test -n "$no_target_directory"; then
+	echo "$0: $dstarg: Is a directory" >&2
+	exit 1
+      fi
+      dst=$dst/`basename "$src"`
+    fi
+  fi

-# Make a temp file name in the proper directory.
+  # This sed command emulates the dirname command.
+  dstdir=`echo "$dst" | sed -e 's,/*$,,;s,[^/]*$,,;s,/*$,,;s,^$,.,'`

-	dsttmp=$dstdir/#inst.$$#
+  # Make sure that the destination directory exists.

-# Move or copy the file name to the temp name
+  # Skip lots of stat calls in the usual case.
+  if test ! -d "$dstdir"; then
+    defaultIFS='
+	 '
+    IFS="${IFS-$defaultIFS}"

-	$doit $instcmd $src $dsttmp &&
+    oIFS=$IFS
+    # Some sh's can't handle IFS=/ for some reason.
+    IFS='%'
+    set x `echo "$dstdir" | sed -e 's@/@%@g' -e 's@^%@/@'`
+    shift
+    IFS=$oIFS

-	trap "rm -f ${dsttmp}" 0 &&
+    pathcomp=

-# and set any options; do chmod last to preserve setuid bits
+    while test $# -ne 0 ; do
+      pathcomp=$pathcomp$1
+      shift
+      if test ! -d "$pathcomp"; then
+        $mkdirprog "$pathcomp"
+	# mkdir can fail with a `File exist' error in case several
+	# install-sh are creating the directory concurrently.  This
+	# is OK.
+	test -d "$pathcomp" || exit
+      fi
+      pathcomp=$pathcomp/
+    done
+  fi

-# If any of these fail, we abort the whole thing.  If we want to
-# ignore errors from any of these, just make sure not to ignore
-# errors from the above "$doit $instcmd $src $dsttmp" command.
+  if test -n "$dir_arg"; then
+    $doit $mkdircmd "$dst" \
+      && { test -z "$chowncmd" || $doit $chowncmd "$dst"; } \
+      && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } \
+      && { test -z "$stripcmd" || $doit $stripcmd "$dst"; } \
+      && { test -z "$chmodcmd" || $doit $chmodcmd "$dst"; }

-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+  else
+    dstfile=`basename "$dst"`

-# Now rename the file to the real destination.
+    # Make a couple of temp file names in the proper directory.
+    dsttmp=$dstdir/_inst.$$_
+    rmtmp=$dstdir/_rm.$$_

-	$doit $rmcmd -f $dstdir/$dstfile &&
-	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+    # Trap to clean up those temp files at exit.
+    trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+    trap '(exit $?); exit' 1 2 13 15

-fi &&
+    # Copy the file name to the temp name.
+    $doit $cpprog "$src" "$dsttmp" &&

+    # and set any options; do chmod last to preserve setuid bits.
+    #
+    # If any of these fail, we abort the whole thing.  If we want to
+    # ignore errors from any of these, just make sure not to ignore
+    # errors from the above "$doit $cpprog $src $dsttmp" command.
+    #
+    { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } \
+      && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } \
+      && { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } \
+      && { test -z "$chmodcmd" || $doit $chmodcmd "$dsttmp"; } &&

-exit 0
+    # Now rename the file to the real destination.
+    { $doit $mvcmd -f "$dsttmp" "$dstdir/$dstfile" 2>/dev/null \
+      || {
+	   # The rename failed, perhaps because mv can't rename something else
+	   # to itself, or perhaps because mv is so ancient that it does not
+	   # support -f.
+
+	   # Now remove or move aside any old file at destination location.
+	   # We try this two ways since rm can't unlink itself on some
+	   # systems and the destination file might be busy for other
+	   # reasons.  In this case, the final cleanup might fail but the new
+	   # file should still install successfully.
+	   {
+	     if test -f "$dstdir/$dstfile"; then
+	       $doit $rmcmd -f "$dstdir/$dstfile" 2>/dev/null \
+	       || $doit $mvcmd -f "$dstdir/$dstfile" "$rmtmp" 2>/dev/null \
+	       || {
+		 echo "$0: cannot unlink or rename $dstdir/$dstfile" >&2
+		 (exit 1); exit 1
+	       }
+	     else
+	       :
+	     fi
+	   } &&
+
+	   # Now rename the file to the real destination.
+	   $doit $mvcmd "$dsttmp" "$dstdir/$dstfile"
+	 }
+    }
+  fi || { (exit 1); exit 1; }
+done
+
+# The final little trick to "correctly" pass the exit status to the exit trap.
+{
+  (exit 0); exit 0
+}
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:
--- a/jccolmmx.asm
+++ b/jccolmmx.asm
@@ -0,0 +1,513 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_rgb_ycc_convert_mmx (j_compress_ptr cinfo,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_rgb_ycc_convert_mmx)
+
+EXTN(jpeg_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jcstruct_image_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1
+	paddd     mm4,mm6
+	paddd     mm7,mm5
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]
+	paddd     mm4, MMWORD [wk(5)]
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]
+	paddd     mm4, MMWORD [wk(7)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JCCOLOR_RGBYCC_MMX_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
--- a/jccolor.c
+++ b/jccolor.c
@@ -5,12 +5,20 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file contains input colorspace conversion routines.
 */

 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */


 /* Private subobject */
@@ -352,6 +360,7 @@ GLOBAL(void)
 jinit_color_converter (j_compress_ptr cinfo)
 {
  my_cconvert_ptr cconvert;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -420,8 +429,23 @@ jinit_color_converter (j_compress_ptr cinfo)
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_RGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_ycc_convert;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_rgb_ycc_convert_sse2)) {
+        cconvert->pub.color_convert = jpeg_rgb_ycc_convert_sse2;
+      } else
+#endif
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+      if (simd & JSIMD_MMX) {
+        cconvert->pub.color_convert = jpeg_rgb_ycc_convert_mmx;
+      } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+      {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_ycc_convert;
+      }
    } else if (cinfo->in_color_space == JCS_YCbCr)
      cconvert->pub.color_convert = null_convert;
    else
@@ -457,3 +481,28 @@ jinit_color_converter (j_compress_ptr cinfo)
    break;
  }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_color_converter (j_compress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2 &&
+      IS_CONST_ALIGNED_16(jconst_rgb_ycc_convert_sse2))
+    return JSIMD_SSE2;
+#endif
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jccolss2.asm
+++ b/jccolss2.asm
@@ -0,0 +1,541 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_rgb_ycc_convert_sse2 (j_compress_ptr cinfo,
+;                            JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                            JDIMENSION output_row, int num_rows);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_rgb_ycc_convert_sse2)
+
+EXTN(jpeg_rgb_ycc_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jcstruct_image_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, _DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, _MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, _DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, _MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [ebx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [edx], xmm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_XMMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	add	ebx, byte SIZEOF_XMMWORD		; outptr1
+	add	edx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JCCOLOR_RGBYCC_SSE2_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : December 24, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains the forward-DCT management logic.
 * This code selects a particular DCT implementation to be used,
 * and it performs related housekeeping chores including coefficient
@@ -24,6 +31,8 @@ typedef struct {

  /* Pointer to the DCT routine actually in use */
  forward_DCT_method_ptr do_dct;
+  convsamp_int_method_ptr convsamp;
+  quantize_int_method_ptr quantize;

  /* The actual post-DCT divisors --- not identical to the quant table
   * entries, because of scaling (especially for an unnormalized DCT).
@@ -34,12 +43,75 @@ typedef struct {
 #ifdef DCT_FLOAT_SUPPORTED
  /* Same as above for the floating-point case. */
  float_DCT_method_ptr do_float_dct;
+  convsamp_float_method_ptr float_convsamp;
+  quantize_float_method_ptr float_quantize;
  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 #endif
 } my_fdct_controller;

 typedef my_fdct_controller * my_fdct_ptr;

+/*
+ * SIMD Ext: Most of SSE/SSE2 instructions require that the memory address
+ * is aligned to a 16-byte boundary; if not, a general-protection exception
+ * (#GP) is generated.
+ */
+
+#define ALIGN_SIZE	16		/* sizeof SSE/SSE2 register */
+#define ALIGN_MEM(p,a)	((void *) (((size_t) (p) + (a) - 1) & -(a)))
+
+#ifdef JFDCT_INT_QUANTIZE_WITH_DIVISION
+#undef jpeg_quantize_int
+#undef jpeg_quantize_int_mmx
+#undef jpeg_quantize_int_sse2
+#define jpeg_quantize_int       jpeg_quantize_idiv
+#define jpeg_quantize_int_mmx   jpeg_quantize_idiv
+#define jpeg_quantize_int_sse2  jpeg_quantize_idiv
+#endif
+
+
+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+/*
+ * SIMD Ext: compute the reciprocal of the divisor
+ *
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ */
+
+LOCAL(void)
+compute_reciprocal (DCTELEM divisor, DCTELEM * dtbl)
+{
+  unsigned long d = ((unsigned long) divisor) & 0x0000FFFF;
+  unsigned long fq, fr;
+  int b, r, c;
+
+  for (b = 0; (1UL << b) <= d; b++) ;
+
+  r  = 16 + (--b);
+  fq = (1UL << r) / d;
+  fr = (1UL << r) % d;
+  r -= 16;
+  c  = 0;
+
+  if (fr == 0) {
+    fq >>= 1;
+    r--;
+  } else if (fr <= (d / 2)) {
+    c++;
+  } else {
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;		/* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) (c + (d / 2));	/* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (16 - (r + 1 + 1)));	/* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) (r + 1);			/* shift */
+}
+
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
+

 /*
 * Initialize for a processing pass.
@@ -75,6 +147,18 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
      /* For LL&M IDCT method, divisors are equal to raw quantization
       * coefficients multiplied by 8 (to counteract scaling).
       */
+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+      if (fdct->divisors[qtblno] == NULL) {
+	fdct->divisors[qtblno] = (DCTELEM *)
+	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
+      }
+      dtbl = fdct->divisors[qtblno];
+      for (i = 0; i < DCTSIZE2; i++) {
+	compute_reciprocal ((DCTELEM) (qtbl->quantval[i] << 3), &dtbl[i]);
+      }
+      break;
+#else  /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
      if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -85,7 +169,8 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
      }
      break;
-#endif
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
    case JDCT_IFAST:
      {
@@ -109,6 +194,21 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	};
 	SHIFT_TEMPS

+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+	if (fdct->divisors[qtblno] == NULL) {
+	  fdct->divisors[qtblno] = (DCTELEM *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
+	}
+	dtbl = fdct->divisors[qtblno];
+	for (i = 0; i < DCTSIZE2; i++) {
+	  compute_reciprocal ((DCTELEM)
+			       DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
+						     (INT32) aanscales[i]),
+				       CONST_BITS-3),
+			      &dtbl[i]);
+	}
+#else  /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -121,9 +221,10 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 				  (INT32) aanscales[i]),
 		    CONST_BITS-3);
 	}
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
      }
      break;
-#endif
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
    case JDCT_FLOAT:
      {
@@ -183,83 +284,23 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 	     JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
-  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  DCTELEM workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(DCTELEM)];
+  DCTELEM * wkptr = (DCTELEM *) ALIGN_MEM(workspace, ALIGN_SIZE);
  JDIMENSION bi;

  sample_data += start_row;	/* fold in the vertical offset once */

  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
+    (*fdct->convsamp) (sample_data, start_col, wkptr);

    /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*fdct->do_dct) (wkptr);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
+    (*fdct->quantize) (coef_blocks[bi], divisors, wkptr);
  }
 }

@@ -273,64 +314,23 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		   JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
-  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  FAST_FLOAT workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(FAST_FLOAT)];
+  FAST_FLOAT * wkptr = (FAST_FLOAT *) ALIGN_MEM(workspace, ALIGN_SIZE);
  JDIMENSION bi;

  sample_data += start_row;	/* fold in the vertical offset once */

  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
+    (*fdct->float_convsamp) (sample_data, start_col, wkptr);

    /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*fdct->do_float_dct) (wkptr);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
+    (*fdct->float_quantize) (coef_blocks[bi], divisors, wkptr);
  }
 }

@@ -346,6 +346,7 @@ jinit_forward_dct (j_compress_ptr cinfo)
 {
  my_fdct_ptr fdct;
  int i;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  fdct = (my_fdct_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -357,21 +358,86 @@ jinit_forward_dct (j_compress_ptr cinfo)
 #ifdef DCT_ISLOW_SUPPORTED
  case JDCT_ISLOW:
    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
-    break;
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_islow_sse2)) {
+      fdct->do_dct = jpeg_fdct_islow_sse2;
+      fdct->convsamp = jpeg_convsamp_int_sse2;
+      fdct->quantize = jpeg_quantize_int_sse2;
+    } else
 #endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      fdct->do_dct = jpeg_fdct_islow_mmx;
+      fdct->convsamp = jpeg_convsamp_int_mmx;
+      fdct->quantize = jpeg_quantize_int_mmx;
+    } else
+#endif
+    {
+      fdct->do_dct = jpeg_fdct_islow;
+      fdct->convsamp = jpeg_convsamp_int;
+      fdct->quantize = jpeg_quantize_int;
+    }
+    break;
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
  case JDCT_IFAST:
    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
-    break;
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_ifast_sse2)) {
+      fdct->do_dct = jpeg_fdct_ifast_sse2;
+      fdct->convsamp = jpeg_convsamp_int_sse2;
+      fdct->quantize = jpeg_quantize_int_sse2;
+    } else
 #endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      fdct->do_dct = jpeg_fdct_ifast_mmx;
+      fdct->convsamp = jpeg_convsamp_int_mmx;
+      fdct->quantize = jpeg_quantize_int_mmx;
+    } else
+#endif
+    {
+      fdct->do_dct = jpeg_fdct_ifast;
+      fdct->convsamp = jpeg_convsamp_int;
+      fdct->quantize = jpeg_quantize_int;
+    }
+    break;
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
  case JDCT_FLOAT:
    fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
-    break;
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse)) {
+      fdct->do_float_dct = jpeg_fdct_float_sse;
+      fdct->float_convsamp = jpeg_convsamp_flt_sse2;
+      fdct->float_quantize = jpeg_quantize_flt_sse2;
+    } else
 #endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse)) {
+      fdct->do_float_dct = jpeg_fdct_float_sse;
+      fdct->float_convsamp = jpeg_convsamp_flt_sse;
+      fdct->float_quantize = jpeg_quantize_flt_sse;
+    } else
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW) {
+      fdct->do_float_dct = jpeg_fdct_float_3dnow;
+      fdct->float_convsamp = jpeg_convsamp_flt_3dnow;
+      fdct->float_quantize = jpeg_quantize_flt_3dnow;
+    } else
+#endif
+    {
+      fdct->do_float_dct = jpeg_fdct_float;
+      fdct->float_convsamp = jpeg_convsamp_float;
+      fdct->float_quantize = jpeg_quantize_float;
+    }
+    break;
+#endif /* DCT_FLOAT_SUPPORTED */
  default:
    ERREXIT(cinfo, JERR_NOT_COMPILED);
    break;
@@ -385,3 +451,65 @@ jinit_forward_dct (j_compress_ptr cinfo)
 #endif
  }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_forward_dct (j_compress_ptr cinfo, int method)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+  switch (method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_islow_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_ISLOW_SUPPORTED */
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_ifast_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_IFAST_SUPPORTED */
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_SSE2); */
+#endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_MMX); */
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW)
+      return JSIMD_3DNOW;	/* (JSIMD_3DNOW | JSIMD_MMX); */
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_FLOAT_SUPPORTED */
+  default:
+    ;
+  }
+
+  return JSIMD_NONE;	/* not compiled */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jcolsamp.h
+++ b/jcolsamp.h
@@ -0,0 +1,143 @@
+/*
+ * jcolsamp.h - private declarations for color conversion & up/downsampling
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * Last Modified : February 4, 2006
+ *
+ * [TAB8]
+ */
+
+
+/* configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+ * valid setting on this SIMD extension.
+ */
+#if BITS_IN_JSAMPLE != 8
+#error "Sorry, this SIMD code only copes with 8-bit sample values."
+#endif
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_rgb_ycc_convert_mmx	jMRgbYccCnv	/* jccolmmx.asm */
+#define jpeg_rgb_ycc_convert_sse2	jSRgbYccCnv	/* jccolss2.asm */
+#define jpeg_h2v1_downsample_mmx	jM21Downsample	/* jcsammmx.asm */
+#define jpeg_h2v2_downsample_mmx	jM22Downsample	/* jcsammmx.asm */
+#define jpeg_h2v1_downsample_sse2	jS21Downsample	/* jcsamss2.asm */
+#define jpeg_h2v2_downsample_sse2	jS22Downsample	/* jcsamss2.asm */
+#define jpeg_ycc_rgb_convert_mmx	jMYccRgbCnv	/* jdcolmmx.asm */
+#define jpeg_ycc_rgb_convert_sse2	jSYccRgbCnv	/* jdcolss2.asm */
+#define jpeg_h2v1_merged_upsample_mmx	jM21MerUpsample	/* jdmermmx.asm */
+#define jpeg_h2v2_merged_upsample_mmx	jM22MerUpsample	/* jdmermmx.asm */
+#define jpeg_h2v1_merged_upsample_sse2	jS21MerUpsample	/* jdmerss2.asm */
+#define jpeg_h2v2_merged_upsample_sse2	jS22MerUpsample	/* jdmerss2.asm */
+#define jpeg_h2v1_fancy_upsample_mmx	jM21FanUpsample	/* jdsammmx.asm */
+#define jpeg_h2v2_fancy_upsample_mmx	jM22FanUpsample	/* jdsammmx.asm */
+#define jpeg_h1v2_fancy_upsample_mmx	jM12FanUpsample	/* jdsammmx.asm */
+#define jpeg_h2v1_upsample_mmx		jM21Upsample	/* jdsammmx.asm */
+#define jpeg_h2v2_upsample_mmx		jM22Upsample	/* jdsammmx.asm */
+#define jpeg_h2v1_fancy_upsample_sse2	jS21FanUpsample	/* jdsamss2.asm */
+#define jpeg_h2v2_fancy_upsample_sse2	jS22FanUpsample	/* jdsamss2.asm */
+#define jpeg_h1v2_fancy_upsample_sse2	jS12FanUpsample	/* jdsamss2.asm */
+#define jpeg_h2v1_upsample_sse2		jS21Upsample	/* jdsamss2.asm */
+#define jpeg_h2v2_upsample_sse2		jS22Upsample	/* jdsamss2.asm */
+#define jconst_rgb_ycc_convert_mmx	jMCRgbYccCnv	/* jccolmmx.asm */
+#define jconst_rgb_ycc_convert_sse2	jSCRgbYccCnv	/* jccolss2.asm */
+#define jconst_ycc_rgb_convert_mmx	jMCYccRgbCnv	/* jdcolmmx.asm */
+#define jconst_ycc_rgb_convert_sse2	jSCYccRgbCnv	/* jdcolss2.asm */
+#define jconst_merged_upsample_mmx	jMCMerUpsample	/* jdmermmx.asm */
+#define jconst_merged_upsample_sse2	jSCMerUpsample	/* jdmerss2.asm */
+#define jconst_fancy_upsample_mmx	jMCFanUpsample	/* jdsammmx.asm */
+#define jconst_fancy_upsample_sse2	jSCFanUpsample	/* jdsamss2.asm */
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+#define jpeg_simd_merged_upsampler	jSiMUpsampler	/* jdmerge.c    */
+#endif
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* Extern declarations for color conversion & up/downsampling routines. */
+
+EXTERN(void) jpeg_rgb_ycc_convert_mmx
+    JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+	 JDIMENSION output_row, int num_rows));
+EXTERN(void) jpeg_rgb_ycc_convert_sse2
+    JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+	 JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jpeg_h2v1_downsample_mmx
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v2_downsample_mmx
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v1_downsample_sse2
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v2_downsample_sse2
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jpeg_ycc_rgb_convert_mmx
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+	 JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jpeg_ycc_rgb_convert_sse2
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+	 JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(void) jpeg_h2v1_merged_upsample_mmx
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v2_merged_upsample_mmx
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v1_merged_upsample_sse2
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v2_merged_upsample_sse2
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jpeg_h2v1_fancy_upsample_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_fancy_upsample_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h1v2_fancy_upsample_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_upsample_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_upsample_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_fancy_upsample_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_fancy_upsample_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h1v2_fancy_upsample_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_upsample_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_upsample_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_rgb_ycc_convert_mmx[];
+extern const int jconst_rgb_ycc_convert_sse2[];
+extern const int jconst_ycc_rgb_convert_mmx[];
+extern const int jconst_ycc_rgb_convert_sse2[];
+extern const int jconst_merged_upsample_mmx[];
+extern const int jconst_merged_upsample_sse2[];
+extern const int jconst_fancy_upsample_mmx[];
+extern const int jconst_fancy_upsample_sse2[];
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_merged_upsampler JPP((j_decompress_ptr cinfo));
+#endif
--- a/jcolsamp.inc
+++ b/jcolsamp.inc
@@ -0,0 +1,156 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; Last Modified : January 5, 2006
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+;
+; configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+; valid setting on this SIMD extension.
+;
+%if BITS_IN_JSAMPLE != 8
+%error "Sorry, this SIMD code only copes with 8-bit sample values."
+%endif
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_rgb_ycc_convert_mmx	jMRgbYccCnv	; jccolmmx.asm
+%define jpeg_rgb_ycc_convert_sse2	jSRgbYccCnv	; jccolss2.asm
+%define jpeg_h2v1_downsample_mmx	jM21Downsample	; jcsammmx.asm
+%define jpeg_h2v2_downsample_mmx	jM22Downsample	; jcsammmx.asm
+%define jpeg_h2v1_downsample_sse2	jS21Downsample	; jcsamss2.asm
+%define jpeg_h2v2_downsample_sse2	jS22Downsample	; jcsamss2.asm
+%define jpeg_ycc_rgb_convert_mmx	jMYccRgbCnv	; jdcolmmx.asm
+%define jpeg_ycc_rgb_convert_sse2	jSYccRgbCnv	; jdcolss2.asm
+%define jpeg_h2v1_merged_upsample_mmx	jM21MerUpsample	; jdmermmx.asm
+%define jpeg_h2v2_merged_upsample_mmx	jM22MerUpsample	; jdmermmx.asm
+%define jpeg_h2v1_merged_upsample_sse2	jS21MerUpsample	; jdmerss2.asm
+%define jpeg_h2v2_merged_upsample_sse2	jS22MerUpsample	; jdmerss2.asm
+%define jpeg_h2v1_fancy_upsample_mmx	jM21FanUpsample	; jdsammmx.asm
+%define jpeg_h2v2_fancy_upsample_mmx	jM22FanUpsample	; jdsammmx.asm
+%define jpeg_h1v2_fancy_upsample_mmx	jM12FanUpsample	; jdsammmx.asm
+%define jpeg_h2v1_upsample_mmx		jM21Upsample	; jdsammmx.asm
+%define jpeg_h2v2_upsample_mmx		jM22Upsample	; jdsammmx.asm
+%define jpeg_h2v1_fancy_upsample_sse2	jS21FanUpsample	; jdsamss2.asm
+%define jpeg_h2v2_fancy_upsample_sse2	jS22FanUpsample	; jdsamss2.asm
+%define jpeg_h1v2_fancy_upsample_sse2	jS12FanUpsample	; jdsamss2.asm
+%define jpeg_h2v1_upsample_sse2		jS21Upsample	; jdsamss2.asm
+%define jpeg_h2v2_upsample_sse2		jS22Upsample	; jdsamss2.asm
+%define jconst_rgb_ycc_convert_mmx	jMCRgbYccCnv	; jccolmmx.asm
+%define jconst_rgb_ycc_convert_sse2	jSCRgbYccCnv	; jccolss2.asm
+%define jconst_ycc_rgb_convert_mmx	jMCYccRgbCnv	; jdcolmmx.asm
+%define jconst_ycc_rgb_convert_sse2	jSCYccRgbCnv	; jdcolss2.asm
+%define jconst_merged_upsample_mmx	jMCMerUpsample	; jdmermmx.asm
+%define jconst_merged_upsample_sse2	jSCMerUpsample	; jdmerss2.asm
+%define jconst_fancy_upsample_mmx	jMCFanUpsample	; jdsammmx.asm
+%define jconst_fancy_upsample_sse2	jSCFanUpsample	; jdsamss2.asm
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%if RGB_RED < 0 || RGB_RED >= RGB_PIXELSIZE || RGB_GREEN < 0 || \
+   RGB_GREEN >= RGB_PIXELSIZE || RGB_BLUE < 0 || RGB_BLUE >= RGB_PIXELSIZE || \
+   RGB_RED == RGB_GREEN || RGB_GREEN == RGB_BLUE || RGB_RED == RGB_BLUE
+%error "Incorrect RGB pixel offset."
+%endif
+
+%if RGB_RED == 0
+%define  mmA  mm0
+%define  mmB  mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1
+%define  mmC  mm0
+%define  mmD  mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2
+%define  mmE  mm0
+%define  mmF  mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3
+%define  mmG  mm0
+%define  mmH  mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+
+; --------------------------------------------------------------------------
--- a/jcomapi.c
+++ b/jcomapi.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 11, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains application interface routines that are used for both
 * compression and decompression.
 */
@@ -104,3 +111,54 @@ jpeg_alloc_huff_table (j_common_ptr cinfo)
  tbl->sent_table = FALSE;	/* make sure this is false in any new table */
  return tbl;
 }
+
+
+/*
+ * SIMD Ext: Checking for support of SIMD instruction set.
+ */
+
+GLOBAL(unsigned int)
+jpeg_simd_support (j_common_ptr cinfo)
+{
+  enum { JSIMD_INVALID = ~0 };
+  static volatile unsigned int simd_supported = JSIMD_INVALID;
+
+  if (simd_supported == JSIMD_INVALID)
+    simd_supported = jpeg_simd_os_support(jpeg_simd_cpu_support());
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+  if (cinfo != NULL)	/* Turn off the masked flags */
+    return simd_supported & ~jpeg_simd_mask(cinfo, JSIMD_NONE, JSIMD_NONE);
+#endif
+  return simd_supported;
+}
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+
+/*
+ * SIMD Ext: modify/retrieve SIMD instruction mask
+ */
+
+GLOBAL(unsigned int)
+jpeg_simd_mask (j_common_ptr cinfo, unsigned int remove, unsigned int add)
+{
+  unsigned long *gp;
+  unsigned int oldmask;
+
+  if (cinfo->is_decompressor)
+    gp = (unsigned long *) &((j_decompress_ptr) cinfo)->output_gamma;
+  else	/* compressor */
+    gp = (unsigned long *) &((j_compress_ptr) cinfo)->input_gamma;
+
+  if ((gp[1] == 0x3FF00000 || gp[1] == 0x00000000) &&	/* +1.0 or +0.0 */
+      (gp[0] & ~JSIMD_ALL) == 0) {
+    oldmask = gp[0];
+    if (((remove | add) & ~JSIMD_ALL) == 0)
+      gp[0] = (oldmask & ~remove) | add;
+  } else {
+    oldmask = 0;	/* error */
+  }
+  return oldmask;
+}
+
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
--- a/jconfig.bc5
+++ b/jconfig.bc5
@@ -0,0 +1,48 @@
+/* jconfig.bc5 --- jconfig.h for Borland C++ Compiler 5.5 (win32) */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS	/* we presume a 32-bit flat memory model */
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN	/* this assumes you have -w-stu in CFLAGS */
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#define TYPEDEF_UCHAR_BOOLEAN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#define TWO_FILE_COMMANDLINE
+#define USE_SETMODE		/* Borland has setmode() */
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
--- a/jconfig.cfg
+++ b/jconfig.cfg
@@ -16,6 +16,9 @@
 /* Define this if you get warnings about undefined structures. */
 #undef INCOMPLETE_TYPES_BROKEN

+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#undef TYPEDEF_UCHAR_BOOLEAN
+
 #ifdef JPEG_INTERNALS

 #undef RIGHT_SHIFT_IS_UNSIGNED
@@ -26,6 +29,13 @@

 #endif /* JPEG_INTERNALS */

+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG

 #define BMP_SUPPORTED		/* BMP image file format */
@@ -35,6 +45,8 @@
 #define TARGA_SUPPORTED		/* Targa image file format */

 #undef TWO_FILE_COMMANDLINE
+#undef USE_SETMODE
+#undef USE_FDOPEN
 #undef NEED_SIGNAL_CATCHER
 #undef DONT_USE_B_MODE

--- a/jconfig.dj
+++ b/jconfig.dj
@@ -21,6 +21,13 @@

 #endif /* JPEG_INTERNALS */

+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG

 #define BMP_SUPPORTED		/* BMP image file format */
@@ -35,4 +42,6 @@
 #undef DONT_USE_B_MODE
 #undef PROGRESS_REPORT		/* optional */

+#define FREE_MEM_ESTIMATE	0	/* for alternate cjpeg/djpeg */
+
 #endif /* JPEG_CJPEG_DJPEG */
--- a/jconfig.linux
+++ b/jconfig.linux
@@ -0,0 +1,44 @@
+/* jconfig.linux --- jconfig.h for Linux ELF with gcc */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#undef TWO_FILE_COMMANDLINE
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
--- a/jconfig.mgw
+++ b/jconfig.mgw
@@ -0,0 +1,48 @@
+/* jconfig.mgw --- jconfig.h for MinGW */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#define TYPEDEF_UCHAR_BOOLEAN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#define TWO_FILE_COMMANDLINE	/* optional */
+#define USE_SETMODE		/* MinGW has setmode() */
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
--- a/jconfig.vc
+++ b/jconfig.vc
@@ -16,11 +16,7 @@
 #undef INCOMPLETE_TYPES_BROKEN

 /* Define "boolean" as unsigned char, not int, per Windows custom */
-#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
-typedef unsigned char boolean;
-#endif
-#define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
-
+#define TYPEDEF_UCHAR_BOOLEAN

 #ifdef JPEG_INTERNALS

@@ -28,6 +24,13 @@ typedef unsigned char boolean;

 #endif /* JPEG_INTERNALS */

+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG

 #define BMP_SUPPORTED		/* BMP image file format */
--- a/jcqnt3dn.asm
+++ b/jcqnt3dn.asm
@@ -0,0 +1,240 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_flt_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_3dnow)
+
+EXTN(jpeg_convsamp_flt_3dnow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	pi2fd	mm4,mm4
+	pi2fd	mm2,mm2
+	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	pi2fd	mm5,mm5
+	pi2fd	mm0,mm0
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	pi2fd	mm6,mm6
+	pi2fd	mm3,mm3
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	pi2fd	mm4,mm4
+	pi2fd	mm1,mm1
+
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_flt_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                          FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_3dnow)
+
+EXTN(jpeg_quantize_flt_3dnow):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
+	movd      mm7,eax
+	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
+	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
+	pfadd	mm2,mm7			; mm0=(04 ** 05 **)
+	pfadd	mm3,mm7			; mm1=(06 ** 07 **)
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
+	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
+	movq      mm5,mm2
+	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
+	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
+
+	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
+	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
+
+	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm6,mm7			; mm0=(10 ** 11 **)
+	pfadd	mm1,mm7			; mm4=(12 ** 13 **)
+	pfadd	mm3,mm7			; mm0=(14 ** 15 **)
+	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
+
+	movq      mm5,mm6
+	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
+	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
+	movq      mm1,mm3
+	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
+	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
+
+	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
+	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jcqntflt.asm
+++ b/jcqntflt.asm
@@ -0,0 +1,202 @@
+;
+; jcqntflt.asm - sample data conversion and quantization (non-SIMD, FP)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : March 21, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_float)
+
+EXTN(jpeg_convsamp_float):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi]		; (JSAMPLE *)
+	add	ebx, JDIMENSION [start_col]
+
+%assign i 0	; i=0
+%rep 4	; -- repeat 4 times ---
+	xor	eax,eax
+	xor	edx,edx
+	mov	al, JSAMPLE [ebx+(i+0)*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [ebx+(i+1)*SIZEOF_JSAMPLE]
+	add	eax, byte -CENTERJSAMPLE
+	add	edx, byte -CENTERJSAMPLE
+	push	eax
+	push	edx
+%assign i i+2	; i+=2
+%endrep	; -- repeat end ---
+
+	fild	INT32 [esp+0*SIZEOF_INT32]
+	fild	INT32 [esp+1*SIZEOF_INT32]
+	fild	INT32 [esp+2*SIZEOF_INT32]
+	fild	INT32 [esp+3*SIZEOF_INT32]
+	fild	INT32 [esp+4*SIZEOF_INT32]
+	fild	INT32 [esp+5*SIZEOF_INT32]
+	fild	INT32 [esp+6*SIZEOF_INT32]
+	fild	INT32 [esp+7*SIZEOF_INT32]
+
+	add	esp, byte DCTSIZE*SIZEOF_INT32
+
+	fstp	FAST_FLOAT [edi+0*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+1*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+2*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+3*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+4*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+5*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+6*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+7*SIZEOF_FAST_FLOAT]
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	edi, byte DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                      FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+%define FLT_ROUNDS	1		; from <float.h>
+
+	align	16
+	global	EXTN(jpeg_quantize_float)
+
+EXTN(jpeg_quantize_float):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; unused
+;	push	edx		; unused
+	push	esi
+	push	edi
+
+%if (FLT_ROUNDS != 1)
+	push	eax
+	fnstcw	word [esp]
+	mov	eax, [esp]
+	and	eax, (~0x0C00)		; round to nearest integer
+	push	eax
+	fldcw	word [esp]
+	pop	eax
+%endif
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/8
+	alignx	16,7
+.quantloop:
+	fld	FAST_FLOAT [esi+0*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+0*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+1*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+1*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+2*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+2*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+3*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+3*SIZEOF_FAST_FLOAT]
+
+	fld	FAST_FLOAT [esi+4*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+4*SIZEOF_FAST_FLOAT]
+	fxch	st0,st1
+	fld	FAST_FLOAT [esi+5*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+5*SIZEOF_FAST_FLOAT]
+	fxch	st0,st3
+	fld	FAST_FLOAT [esi+6*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+6*SIZEOF_FAST_FLOAT]
+	fxch	st0,st5
+	fld	FAST_FLOAT [esi+7*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+7*SIZEOF_FAST_FLOAT]
+	fxch	st0,st7
+
+	fistp	JCOEF [edi+0*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+1*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+2*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+3*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+4*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+5*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+6*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+7*SIZEOF_JCOEF]
+
+	add	esi, byte 8*SIZEOF_FAST_FLOAT
+	add	ebx, byte 8*SIZEOF_FAST_FLOAT
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+%if (FLT_ROUNDS != 1)
+	fldcw	word [esp]
+	pop	eax		; pop old control word
+%endif
+	pop	edi
+	pop	esi
+;	pop	edx		; unused
+;	pop	ecx		; unused
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jcqntint.asm
+++ b/jcqntint.asm
@@ -0,0 +1,243 @@
+;
+; jcqntint.asm - sample data conversion and quantization (non-SIMD, integer)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_int (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int)
+
+EXTN(jpeg_convsamp_int):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi]		; (JSAMPLE *)
+	add	ebx, JDIMENSION [start_col]
+
+%assign i 0	; i=0
+%rep 4	; -- repeat 4 times ---
+	xor	eax,eax
+	xor	edx,edx
+	mov	al, JSAMPLE [ebx+(i+0)*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [ebx+(i+1)*SIZEOF_JSAMPLE]
+	add	eax, byte -CENTERJSAMPLE
+	add	edx, byte -CENTERJSAMPLE
+	mov	DCTELEM [edi+(i+0)*SIZEOF_DCTELEM], ax
+	mov	DCTELEM [edi+(i+1)*SIZEOF_DCTELEM], dx
+%assign i i+2	; i+=2
+%endrep	; -- repeat end ---
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	edi, byte DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int (JCOEFPTR coef_block, DCTELEM * divisors,
+;                    DCTELEM * workspace);
+;
+
+%define RECIPROCAL(i,b)	((b)+((i)+DCTSIZE2*0)*SIZEOF_DCTELEM)
+%define CORRECTION(i,b)	((b)+((i)+DCTSIZE2*1)*SIZEOF_DCTELEM)
+%define SHIFT(i,b)	((b)+((i)+DCTSIZE2*3)*SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+%define UNROLL	2
+
+	align	16
+	global	EXTN(jpeg_quantize_int)
+
+EXTN(jpeg_quantize_int):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ecx, DCTSIZE2/UNROLL
+	alignx	16,7
+.quantloop:
+	push	ecx
+
+%assign i 0	; i=0;
+%rep UNROLL	; ---- repeat (UNROLL) times ----
+	mov	cx, DCTELEM [esi+(i)*SIZEOF_DCTELEM]
+	mov	ax,cx
+	sar	cx,(WORD_BIT-1)
+	xor	ax,cx		; if (ax < 0) ax = -ax;
+	sub	ax,cx
+	add	ax, DCTELEM [CORRECTION(i,ebx)]	; correction + roundfactor
+	shl	ax,1
+	mul	DCTELEM [RECIPROCAL(i,ebx)]	; reciprocal
+	mov	ax,cx
+	mov	cx, DCTELEM [SHIFT(i,ebx)]	; shift
+	shr	dx,cl
+	xor	dx,ax
+	sub	dx,ax
+	mov	JCOEF [edi+(i)*SIZEOF_JCOEF], dx
+%assign i i+1	; i++;
+%endrep		; ---- repeat end ----
+
+	pop	ecx
+
+	add	esi, byte UNROLL*SIZEOF_DCTELEM
+	add	ebx, byte UNROLL*SIZEOF_DCTELEM
+	add	edi, byte UNROLL*SIZEOF_JCOEF
+	dec	ecx
+	jnz	.quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%else ; JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_idiv (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_idiv)
+
+EXTN(jpeg_quantize_idiv):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ecx, DCTSIZE2
+	alignx	16,7
+.quantloop:
+	push	ecx
+
+	movsx	ecx, DCTELEM [esi]	; temp
+	mov	eax,ecx
+	sar	ecx,(DWORD_BIT-1)
+	xor	edx,edx
+	mov	dx, DCTELEM [ebx]	; qval
+	xor	eax,ecx			; if (eax < 0) eax = -eax;
+	shr	edx,1
+	sub	eax,ecx
+	cmp	eax,edx			; if (temp + qval/2 >= qval)
+	jge	short .quant
+	; ---- if the quantized coefficient is zero
+	xor	eax,eax
+	jmp	short .output
+	alignx	16,7
+.quant:	; ---- do quantization
+	add	eax,edx
+	xor	edx,edx
+	div	DCTELEM [ebx]		; Q:ax,R:dx
+	xor	ax,cx
+	sub	ax,cx
+	alignx	16,7
+.output:
+	mov	JCOEF [edi], ax
+
+	pop	ecx
+
+	add	esi, byte SIZEOF_DCTELEM
+	add	ebx, byte SIZEOF_DCTELEM
+	add	edi, byte SIZEOF_JCOEF
+	dec	ecx
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
--- a/jcqntmmx.asm
+++ b/jcqntmmx.asm
@@ -0,0 +1,254 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_int_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                        DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int_mmx)
+
+EXTN(jpeg_convsamp_int_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                        DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_int_mmx)
+
+EXTN(jpeg_quantize_int_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+	movq	mm0,mm2
+	movq	mm1,mm3
+	psraw	mm2,(WORD_BIT-1)
+	psraw	mm3,(WORD_BIT-1)
+	pxor	mm0,mm2
+	pxor	mm1,mm3
+	psubw	mm0,mm2		; if (mm0 < 0) mm0 = -mm0;
+	psubw	mm1,mm3		; if (mm1 < 0) mm1 = -mm1;
+
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	; 
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	; 
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	; 
+	;   return (unsigned long) sz;
+	; }
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+	psllw	mm0,1
+	psllw	mm1,1
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1)
+	paddw	mm1,mm5
+	psllw	mm0,1
+	psllw	mm1,1
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0,mm6
+	pmulhw	mm1,mm7
+	psraw	mm6,(WORD_BIT-1)
+	psraw	mm7,(WORD_BIT-1)
+	pand	mm6,mm4
+	pand	mm7,mm5
+	paddw	mm0,mm6
+	paddw	mm1,mm7
+	psraw	mm4,(WORD_BIT-1)
+	psraw	mm5,(WORD_BIT-1)
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; scale
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
+%endif ; JFDCT_INT_MMX_SUPPORTED
--- a/jcqnts2f.asm
+++ b/jcqnts2f.asm
@@ -0,0 +1,178 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 18, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_flt_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                         FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_sse2)
+
+EXTN(jpeg_convsamp_flt_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_flt_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_sse2)
+
+EXTN(jpeg_quantize_flt_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_SSE2_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jcqnts2i.asm
+++ b/jcqnts2i.asm
@@ -0,0 +1,216 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_int_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                         DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int_sse2)
+
+EXTN(jpeg_convsamp_int_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, _MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                         DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_int_sse2)
+
+EXTN(jpeg_quantize_int_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/32
+	alignx	16,7
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
+	psllw	xmm0,1
+	psllw	xmm1,1
+	psllw	xmm2,1
+	psllw	xmm3,1
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+	psllw	xmm0,1
+	psllw	xmm1,1
+	psllw	xmm2,1
+	psllw	xmm3,1
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
+
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 32*SIZEOF_DCTELEM
+	add	edx, byte 32*SIZEOF_DCTELEM
+	add	edi, byte 32*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
+%endif ; JFDCT_INT_SSE2_SUPPORTED
--- a/jcqntsse.asm
+++ b/jcqntsse.asm
@@ -0,0 +1,218 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 12, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_flt_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                        FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_sse)
+
+EXTN(jpeg_convsamp_flt_sse):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+
+	movlhps   xmm0,xmm1			; xmm0=(0123)
+	movlhps   xmm2,xmm3			; xmm2=(4567)
+	movlhps   xmm4,xmm5			; xmm4=(89AB)
+	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_flt_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                        FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_sse)
+
+EXTN(jpeg_quantize_flt_sse):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	movhlps  xmm4,xmm0
+	movhlps  xmm5,xmm1
+
+	cvtps2pi mm0,xmm0
+	cvtps2pi mm1,xmm1
+	cvtps2pi mm4,xmm4
+	cvtps2pi mm5,xmm5
+
+	movhlps  xmm6,xmm2
+	movhlps  xmm7,xmm3
+
+	cvtps2pi mm2,xmm2
+	cvtps2pi mm3,xmm3
+	cvtps2pi mm6,xmm6
+	cvtps2pi mm7,xmm7
+
+	packssdw mm0,mm4
+	packssdw mm1,mm5
+	packssdw mm2,mm6
+	packssdw mm3,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jcsammmx.asm
+++ b/jcsammmx.asm
@@ -0,0 +1,328 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JCSAMPLE_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v1_downsample_mmx (j_compress_ptr cinfo,
+;                           jpeg_component_info * compptr,
+;                           JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v1_downsample_mmx)
+
+EXTN(jpeg_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	short .return
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6
+	psrlw	mm2,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2
+	paddw	mm1,mm3
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+	psrlw	mm0,1
+	psrlw	mm1,1
+
+	packuswb mm0,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v2_downsample_mmx (j_compress_ptr cinfo,
+;                           jpeg_component_info * compptr,
+;                           JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v2_downsample_mmx)
+
+EXTN(jpeg_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1
+	paddw	mm2,mm3
+	paddw	mm0,mm7
+	paddw	mm2,mm7
+	psrlw	mm0,2
+	psrlw	mm2,2
+
+	packuswb mm0,mm2
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JCSAMPLE_MMX_SUPPORTED
--- a/jcsample.c
+++ b/jcsample.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file contains downsampling routines.
 *
 * Downsampling input data is counted in "row groups".  A row group
@@ -48,6 +55,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */


 /* Pointer to routine to downsample a single component */
@@ -467,6 +475,7 @@ jinit_downsampler (j_compress_ptr cinfo)
  int ci;
  jpeg_component_info * compptr;
  boolean smoothok = TRUE;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  downsample = (my_downsample_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -494,7 +503,17 @@ jinit_downsampler (j_compress_ptr cinfo)
    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
      smoothok = FALSE;
-      downsample->methods[ci] = h2v1_downsample;
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2)
+	downsample->methods[ci] = jpeg_h2v1_downsample_sse2;
+      else
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	downsample->methods[ci] = jpeg_h2v1_downsample_mmx;
+      else
+#endif
+	downsample->methods[ci] = h2v1_downsample;
    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
@@ -502,6 +521,16 @@ jinit_downsampler (j_compress_ptr cinfo)
 	downsample->methods[ci] = h2v2_smooth_downsample;
 	downsample->pub.need_context_rows = TRUE;
      } else
+#endif
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2)
+	downsample->methods[ci] = jpeg_h2v2_downsample_sse2;
+      else
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	downsample->methods[ci] = jpeg_h2v2_downsample_mmx;
+      else
 #endif
 	downsample->methods[ci] = h2v2_downsample;
    } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
@@ -517,3 +546,25 @@ jinit_downsampler (j_compress_ptr cinfo)
    TRACEMS(cinfo, 0, JTRC_SMOOTH_NOTIMPL);
 #endif
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_downsampler (j_compress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2)
+    return JSIMD_SSE2;
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;
+#endif
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jcsamss2.asm
+++ b/jcsamss2.asm
@@ -0,0 +1,355 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JCSAMPLE_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v1_downsample_sse2 (j_compress_ptr cinfo,
+;                            jpeg_component_info * compptr,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v1_downsample_sse2)
+
+EXTN(jpeg_h2v1_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6
+	psrlw	xmm2,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	ecx,ecx
+	jnz	short .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v2_downsample_sse2 (j_compress_ptr cinfo,
+;                            jpeg_component_info * compptr,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v2_downsample_sse2)
+
+EXTN(jpeg_h2v2_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4
+	paddw	xmm1,xmm5
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JCSAMPLE_SSE2_SUPPORTED
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : December 18, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains the coefficient buffer controller for decompression.
 * This controller is the top level of the JPEG decompressor proper.
 * The coefficient buffer lies between entropy decoding and inverse-DCT steps.
@@ -133,6 +140,11 @@ start_output_pass (j_decompress_ptr cinfo)
 }


+#ifndef NEED_FAR_POINTERS
+#undef jzero_far
+#define jzero_far(target, bytestozero)  MEMZERO(target, bytestozero)
+#endif
+
 /*
 * Decompress and return some data in the single-pass case.
 * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
@@ -150,15 +162,61 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  JDIMENSION MCU_col_num;	/* index of current MCU within row */
  JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
-  int blkn, ci, xindex, yindex, yoffset, useful_width;
+  int blkn, ci, ctr, xindex, yindex, yoffset;
  JSAMPARRAY output_ptr;
-  JDIMENSION start_col, output_col;
+  JDIMENSION output_col;
  jpeg_component_info *compptr;
  inverse_DCT_method_ptr inverse_DCT;
+  JSAMPARRAY output_ptr_blk[D_MAX_BLOCKS_IN_MCU];
+  JDIMENSION output_col_off[D_MAX_BLOCKS_IN_MCU];
+  jpeg_component_info *compptr_blk[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr inverse_DCT_blk_1[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr inverse_DCT_blk_2[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr *inverse_DCT_blk;

  /* Loop to process as much as one whole iMCU row */
  for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
       yoffset++) {
+    /* Determine where data should go in output_buf and do the IDCT thing.
+     * We skip dummy blocks at the right and bottom edges (but blkn gets
+     * incremented past them!).  Note the inner loop relies on having
+     * allocated the MCU_buffer[] blocks sequentially.
+     */
+    blkn = 0;			/* index of current DCT block within MCU */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Don't bother to IDCT an uninteresting component. */
+      if (! compptr->component_needed) {
+	for (ctr = compptr->MCU_blocks; ctr > 0; ctr--) {
+	  inverse_DCT_blk_1[blkn] = inverse_DCT_blk_2[blkn] = NULL;
+	  blkn++;
+	}
+	continue;
+      }
+      inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+      output_ptr = output_buf[compptr->component_index] +
+	yoffset * compptr->DCT_scaled_size;
+      for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+	if (cinfo->input_iMCU_row < last_iMCU_row ||
+	    yoffset+yindex < compptr->last_row_height) {
+	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+	    compptr_blk[blkn] = compptr;
+	    output_ptr_blk[blkn] = output_ptr;
+	    output_col_off[blkn] = xindex * compptr->DCT_scaled_size;
+	    inverse_DCT_blk_1[blkn] = inverse_DCT;
+	    inverse_DCT_blk_2[blkn] = (xindex < compptr->last_col_width) ?
+				      inverse_DCT : NULL;
+	    blkn++;
+	  }
+	} else {
+	  for (ctr = compptr->MCU_width; ctr > 0; ctr--) {
+	    inverse_DCT_blk_1[blkn] = inverse_DCT_blk_2[blkn] = NULL;
+	    blkn++;
+	  }
+	}
+	output_ptr += compptr->DCT_scaled_size;
+      }
+    }
    for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
      /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
@@ -170,39 +228,17 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	coef->MCU_ctr = MCU_col_num;
 	return JPEG_SUSPENDED;
      }
-      /* Determine where data should go in output_buf and do the IDCT thing.
-       * We skip dummy blocks at the right and bottom edges (but blkn gets
-       * incremented past them!).  Note the inner loop relies on having
-       * allocated the MCU_buffer[] blocks sequentially.
-       */
-      blkn = 0;			/* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	/* Don't bother to IDCT an uninteresting component. */
-	if (! compptr->component_needed) {
-	  blkn += compptr->MCU_blocks;
+      inverse_DCT_blk = (MCU_col_num < last_MCU_col) ? inverse_DCT_blk_1
+						     : inverse_DCT_blk_2;
+      for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+	inverse_DCT = inverse_DCT_blk[blkn];
+	if (inverse_DCT == NULL)
 	  continue;
-	}
-	inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						    : compptr->last_col_width;
-	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
-	start_col = MCU_col_num * compptr->MCU_sample_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  if (cinfo->input_iMCU_row < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
-	    output_col = start_col;
-	    for (xindex = 0; xindex < useful_width; xindex++) {
-	      (*inverse_DCT) (cinfo, compptr,
-			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
-	    }
-	  }
-	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
-	}
+	compptr = compptr_blk[blkn];
+	output_col = MCU_col_num * compptr->MCU_sample_width +
+		     output_col_off[blkn];
+	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) coef->MCU_buffer[blkn],
+			output_ptr_blk[blkn], output_col);
      }
    }
    /* Completed an MCU row, but perhaps not an iMCU row */
@@ -250,6 +286,8 @@ consume_data (j_decompress_ptr cinfo)
  JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
  JBLOCKROW buffer_ptr;
  jpeg_component_info *compptr;
+  int MCU_width[D_MAX_BLOCKS_IN_MCU];
+  JBLOCKROW MCU_buffer_base[D_MAX_BLOCKS_IN_MCU];

  /* Align the virtual buffers for the components used in this scan. */
  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
@@ -267,19 +305,24 @@ consume_data (j_decompress_ptr cinfo)
  /* Loop to process one whole iMCU row */
  for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
       yoffset++) {
+    /* Construct list of pointers to DCT blocks belonging to this MCU */
+    blkn = 0;			/* index of current DCT block within MCU */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+	buffer_ptr = buffer[ci][yindex+yoffset];
+	for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+	  MCU_width[blkn] = compptr->MCU_width;
+	  MCU_buffer_base[blkn] = buffer_ptr++;
+	  blkn++;
+	}
+      }
+    }
    for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
 	 MCU_col_num++) {
-      /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
-	}
+      for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+	start_col = MCU_col_num * MCU_width[blkn];
+	coef->MCU_buffer[blkn] = MCU_buffer_base[blkn] + start_col;
      }
      /* Try to fetch the MCU. */
      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
@@ -453,6 +496,15 @@ smoothing_ok (j_decompress_ptr cinfo)
 }


+/*
+ * SIMD Ext: Most of SSE/SSE2 instructions require that the memory address
+ * is aligned to a 16-byte boundary; if not, a general-protection exception
+ * (#GP) is generated.
+ */
+
+#define ALIGN_SIZE	16		/* sizeof SSE/SSE2 register */
+#define ALIGN_MEM(p,a)	((void *) (((size_t) (p) + (a) - 1) & -(a)))
+
 /*
 * Variant of decompress_data for use when doing block smoothing.
 */
@@ -471,7 +523,8 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  jpeg_component_info *compptr;
  inverse_DCT_method_ptr inverse_DCT;
  boolean first_row, last_row;
-  JBLOCK workspace;
+  JCOEF workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(JCOEF)];
+  JCOEF * workptr = (JCOEF *) ALIGN_MEM(workspace, ALIGN_SIZE);
  int *coef_bits;
  JQUANT_TBL *quanttbl;
  INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
@@ -560,7 +613,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
      last_block_column = compptr->width_in_blocks - 1;
      for (block_num = 0; block_num <= last_block_column; block_num++) {
 	/* Fetch current DCT block into workspace so we can modify it. */
-	jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+	jcopy_block_row(buffer_ptr, (JBLOCKROW) workptr, (JDIMENSION) 1);
 	/* Update DC values */
 	if (block_num < last_block_column) {
 	  DC3 = (int) prev_block_row[1][0];
@@ -572,7 +625,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	 * and is not known to be fully accurate.
 	 */
 	/* AC01 */
-	if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
+	if ((Al=coef_bits[1]) != 0 && workptr[1] == 0) {
 	  num = 36 * Q00 * (DC4 - DC6);
 	  if (num >= 0) {
 	    pred = (int) (((Q01<<7) + num) / (Q01<<8));
@@ -584,10 +637,10 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[1] = (JCOEF) pred;
+	  workptr[1] = (JCOEF) pred;
 	}
 	/* AC10 */
-	if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
+	if ((Al=coef_bits[2]) != 0 && workptr[8] == 0) {
 	  num = 36 * Q00 * (DC2 - DC8);
 	  if (num >= 0) {
 	    pred = (int) (((Q10<<7) + num) / (Q10<<8));
@@ -599,10 +652,10 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[8] = (JCOEF) pred;
+	  workptr[8] = (JCOEF) pred;
 	}
 	/* AC20 */
-	if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
+	if ((Al=coef_bits[3]) != 0 && workptr[16] == 0) {
 	  num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
 	  if (num >= 0) {
 	    pred = (int) (((Q20<<7) + num) / (Q20<<8));
@@ -614,10 +667,10 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[16] = (JCOEF) pred;
+	  workptr[16] = (JCOEF) pred;
 	}
 	/* AC11 */
-	if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
+	if ((Al=coef_bits[4]) != 0 && workptr[9] == 0) {
 	  num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
 	  if (num >= 0) {
 	    pred = (int) (((Q11<<7) + num) / (Q11<<8));
@@ -629,10 +682,10 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[9] = (JCOEF) pred;
+	  workptr[9] = (JCOEF) pred;
 	}
 	/* AC02 */
-	if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
+	if ((Al=coef_bits[5]) != 0 && workptr[2] == 0) {
 	  num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
 	  if (num >= 0) {
 	    pred = (int) (((Q02<<7) + num) / (Q02<<8));
@@ -644,10 +697,10 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[2] = (JCOEF) pred;
+	  workptr[2] = (JCOEF) pred;
 	}
 	/* OK, do the IDCT */
-	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
+	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workptr,
 			output_ptr, output_col);
 	/* Advance for next column */
 	DC1 = DC2; DC2 = DC3;
--- a/jdcolmmx.asm
+++ b/jdcolmmx.asm
@@ -0,0 +1,438 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_ycc_rgb_convert_mmx (j_decompress_ptr cinfo,
+;                           JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                           JSAMPARRAY output_buf, int num_rows)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_ycc_rgb_convert_mmx)
+
+EXTN(jpeg_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JDCOLOR_YCCRGB_MMX_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -5,12 +5,20 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file contains output colorspace conversion routines.
 */

 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */


 /* Private subobject */
@@ -105,6 +113,17 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
 }


+#if RGB_PIXELSIZE == 4
+/* offset of filler byte */
+#define RGB_FILLER  (6 - (RGB_RED) - (RGB_GREEN) - (RGB_BLUE))
+/* byte pattern to fill with */
+#ifdef RGBX_FILLER_0XFF
+#define RGB_FILLER_BYTE 0xFF
+#else
+#define RGB_FILLER_BYTE 0x00
+#endif
+#endif /* RGB_PIXELSIZE == 4 */
+
 /*
 * Convert some rows of samples to the output colorspace.
 *
@@ -151,6 +170,9 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
 			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
 						 SCALEBITS))];
      outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
+#if RGB_PIXELSIZE == 4
+      outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
      outptr += RGB_PIXELSIZE;
    }
  }
@@ -228,6 +250,9 @@ gray_rgb_convert (j_decompress_ptr cinfo,
    for (col = 0; col < num_cols; col++) {
      /* We can dispense with GETJSAMPLE() here */
      outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
+#if RGB_PIXELSIZE == 4
+      outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
      outptr += RGB_PIXELSIZE;
    }
  }
@@ -305,6 +330,7 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
 {
  my_cconvert_ptr cconvert;
  int ci;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -358,8 +384,23 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
  case JCS_RGB:
    cinfo->out_color_components = RGB_PIXELSIZE;
    if (cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = ycc_rgb_convert;
-      build_ycc_rgb_table(cinfo);
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_ycc_rgb_convert_sse2)) {
+        cconvert->pub.color_convert = jpeg_ycc_rgb_convert_sse2;
+      } else
+#endif
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+      if (simd & JSIMD_MMX) {
+        cconvert->pub.color_convert = jpeg_ycc_rgb_convert_mmx;
+      } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+      {
+        cconvert->pub.color_convert = ycc_rgb_convert;
+        build_ycc_rgb_table(cinfo);
+      }
    } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
      cconvert->pub.color_convert = gray_rgb_convert;
    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
@@ -394,3 +435,28 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
  else
    cinfo->output_components = cinfo->out_color_components;
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_color_deconverter (j_decompress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2 &&
+      IS_CONST_ALIGNED_16(jconst_ycc_rgb_convert_sse2))
+    return JSIMD_SSE2;
+#endif
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jdcolss2.asm
+++ b/jdcolss2.asm
@@ -0,0 +1,536 @@
+;
+; jdcolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_ycc_rgb_convert_sse2 (j_decompress_ptr cinfo,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_ycc_rgb_convert_sse2)
+
+EXTN(jpeg_ycc_rgb_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx
+	xor	ecx, byte 0x0F
+	shl	ecx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	short .nextrow
+	mov	eax,ecx
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JDCOLOR_YCCRGB_SSE2_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
--- a/jdct.h
+++ b/jdct.h
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This include file contains common declarations for the forward and
 * inverse DCT modules.  These declarations are private to the DCT managers
 * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms.
@@ -13,6 +20,13 @@
 */


+/* SIMD Ext: configuration check */
+
+#if BITS_IN_JSAMPLE != 8
+#error "Sorry, this SIMD code only copes with 8-bit sample values."
+#endif
+
+
 /*
 * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
 * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
@@ -26,14 +40,25 @@
 * Quantization of the output coefficients is done by jcdctmgr.c.
 */

-#if BITS_IN_JSAMPLE == 8
-typedef int DCTELEM;		/* 16 or 32 bits is fine */
-#else
-typedef INT32 DCTELEM;		/* must have 32 bits */
-#endif
+/* SIMD Ext: To maximize parallelism, Type DCTELEM is changed to short
+ * (originally, int).
+ */
+typedef short DCTELEM;		/* SIMD Ext: must be short */

 typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
 typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, convsamp_int_method_ptr,
+		(JSAMPARRAY sample_data, JDIMENSION start_col,
+		 DCTELEM * workspace));
+typedef JMETHOD(void, convsamp_float_method_ptr,
+		(JSAMPARRAY sample_data, JDIMENSION start_col,
+		 FAST_FLOAT *workspace));
+typedef JMETHOD(void, quantize_int_method_ptr,
+		(JCOEFPTR coef_block, DCTELEM * divisors,
+		 DCTELEM * workspace));
+typedef JMETHOD(void, quantize_float_method_ptr,
+		(JCOEFPTR coef_block, FAST_FLOAT * divisors,
+		 FAST_FLOAT * workspace));


 /*
@@ -49,19 +74,22 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));

 /* typedef inverse_DCT_method_ptr is declared in jpegint.h */

+/* SIMD Ext: To maximize parallelism, Type MULTIPLIER is changed to short.
+ * Macro definitions of MULTIPLIER and FAST_FLOAT in jmorecfg.h are ignored.
+ */
+#undef MULTIPLIER
+#define MULTIPLIER  short	/* SIMD Ext: must be short */
+#undef FAST_FLOAT
+#define FAST_FLOAT  float	/* SIMD Ext: must be float */
+
 /*
 * Each IDCT routine has its own ideas about the best dct_table element type.
 */

-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
-#if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE;	/* SIMD Ext: must be short */
+typedef MULTIPLIER IFAST_MULT_TYPE;	/* SIMD Ext: must be short */
 #define IFAST_SCALE_BITS  2	/* fractional bits in scale factors */
-#else
-typedef INT32 IFAST_MULT_TYPE;	/* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS  13	/* fractional bits in scale factors */
-#endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE;	/* SIMD Ext: must be float */


 /*
@@ -81,15 +109,64 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 /* Short forms of external names for systems with brain-damaged linkers. */

 #ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_fdct_islow		jFDislow
-#define jpeg_fdct_ifast		jFDifast
-#define jpeg_fdct_float		jFDfloat
-#define jpeg_idct_islow		jRDislow
-#define jpeg_idct_ifast		jRDifast
-#define jpeg_idct_float		jRDfloat
-#define jpeg_idct_4x4		jRD4x4
-#define jpeg_idct_2x2		jRD2x2
-#define jpeg_idct_1x1		jRD1x1
+#define jpeg_fdct_islow		jFDislow		/* jfdctint.asm */
+#define jpeg_fdct_ifast		jFDifast		/* jfdctfst.asm */
+#define jpeg_fdct_float		jFDfloat		/* jfdctflt.asm */
+#define jpeg_fdct_islow_mmx	jFDMislow		/* jfmmxint.asm */
+#define jpeg_fdct_ifast_mmx	jFDMifast		/* jfmmxfst.asm */
+#define jpeg_fdct_float_3dnow	jFD3float		/* jf3dnflt.asm */
+#define jpeg_fdct_islow_sse2	jFDSislow		/* jfss2int.asm */
+#define jpeg_fdct_ifast_sse2	jFDSifast		/* jfss2fst.asm */
+#define jpeg_fdct_float_sse	jFDSfloat		/* jfsseflt.asm */
+#define jpeg_convsamp_int	jCnvInt			/* jcqntint.asm */
+#define jpeg_quantize_int	jQntInt			/* jcqntint.asm */
+#define jpeg_quantize_idiv	jQntIDiv		/* jcqntint.asm */
+#define jpeg_convsamp_float	jCnvFloat		/* jcqntflt.asm */
+#define jpeg_quantize_float	jQntFloat		/* jcqntflt.asm */
+#define jpeg_convsamp_int_mmx	jCnvMmx			/* jcqntmmx.asm */
+#define jpeg_quantize_int_mmx	jQntMmx			/* jcqntmmx.asm */
+#define jpeg_convsamp_flt_3dnow	jCnv3dnow		/* jcqnt3dn.asm */
+#define jpeg_quantize_flt_3dnow	jQnt3dnow		/* jcqnt3dn.asm */
+#define jpeg_convsamp_int_sse2	jCnvISse2		/* jcqnts2i.asm */
+#define jpeg_quantize_int_sse2	jQntISse2		/* jcqnts2i.asm */
+#define jpeg_convsamp_flt_sse	jCnvSse			/* jcqntsse.asm */
+#define jpeg_quantize_flt_sse	jQntSse			/* jcqntsse.asm */
+#define jpeg_convsamp_flt_sse2	jCnvFSse2		/* jcqnts2f.asm */
+#define jpeg_quantize_flt_sse2	jQntFSse2		/* jcqnts2f.asm */
+#define jpeg_idct_islow		jRDislow		/* jidctint.asm */
+#define jpeg_idct_ifast		jRDifast		/* jidctfst.asm */
+#define jpeg_idct_float		jRDfloat		/* jidctflt.asm */
+#define jpeg_idct_4x4		jRD4x4			/* jidctred.asm */
+#define jpeg_idct_2x2		jRD2x2			/* jidctred.asm */
+#define jpeg_idct_1x1		jRD1x1			/* jidctred.asm */
+#define jpeg_idct_islow_mmx	jRDMislow		/* jimmxint.asm */
+#define jpeg_idct_ifast_mmx	jRDMifast		/* jimmxfst.asm */
+#define jpeg_idct_float_3dnow	jRD3float		/* ji3dnflt.asm */
+#define jpeg_idct_4x4_mmx	jRDM4x4			/* jimmxred.asm */
+#define jpeg_idct_2x2_mmx	jRDM2x2			/* jimmxred.asm */
+#define jpeg_idct_islow_sse2	jRDSislow		/* jiss2int.asm */
+#define jpeg_idct_ifast_sse2	jRDSifast		/* jiss2fst.asm */
+#define jpeg_idct_float_sse	jRDSfloat		/* jisseflt.asm */
+#define jpeg_idct_float_sse2	jRD2float		/* jiss2flt.asm */
+#define jpeg_idct_4x4_sse2	jRDS4x4			/* jiss2red.asm */
+#define jpeg_idct_2x2_sse2	jRDS2x2			/* jiss2red.asm */
+#define jconst_fdct_float	jFCfloat		/* jfdctflt.asm */
+#define jconst_fdct_islow_mmx	jFCMislow		/* jfmmxint.asm */
+#define jconst_fdct_ifast_mmx	jFCMifast		/* jfmmxfst.asm */
+#define jconst_fdct_float_3dnow	jFC3float		/* jf3dnflt.asm */
+#define jconst_fdct_islow_sse2	jFCSislow		/* jfss2int.asm */
+#define jconst_fdct_ifast_sse2	jFCSifast		/* jfss2fst.asm */
+#define jconst_fdct_float_sse	jFCSfloat		/* jfsseflt.asm */
+#define jconst_idct_float	jRCfloat		/* jidctflt.asm */
+#define jconst_idct_islow_mmx	jRCMislow		/* jimmxint.asm */
+#define jconst_idct_ifast_mmx	jRCMifast		/* jimmxfst.asm */
+#define jconst_idct_float_3dnow	jRC3float		/* ji3dnflt.asm */
+#define jconst_idct_red_mmx	jRCMred			/* jimmxred.asm */
+#define jconst_idct_islow_sse2	jRCSislow		/* jiss2int.asm */
+#define jconst_idct_ifast_sse2	jRCSifast		/* jiss2fst.asm */
+#define jconst_idct_float_sse	jRCSfloat		/* jisseflt.asm */
+#define jconst_idct_float_sse2	jRC2float		/* jiss2flt.asm */
+#define jconst_idct_red_sse2	jRCSred			/* jiss2red.asm */
 #endif /* NEED_SHORT_EXTERNAL_NAMES */

 /* Extern declarations for the forward and inverse DCT routines. */
@@ -98,6 +175,47 @@ EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
 EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
 EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));

+EXTERN(void) jpeg_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_ifast_mmx JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+EXTERN(void) jpeg_fdct_islow_sse2 JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_ifast_sse2 JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_float_sse JPP((FAST_FLOAT * data));
+
+EXTERN(void) jpeg_convsamp_int
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_idiv
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_float
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_float
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
+EXTERN(void) jpeg_convsamp_int_mmx
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int_mmx
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_flt_3dnow
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_3dnow
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
+EXTERN(void) jpeg_convsamp_int_sse2
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int_sse2
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_flt_sse
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_sse
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+EXTERN(void) jpeg_convsamp_flt_sse2
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_sse2
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
 EXTERN(void) jpeg_idct_islow
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
@@ -117,6 +235,60 @@ EXTERN(void) jpeg_idct_1x1
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));

+EXTERN(void) jpeg_idct_islow_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_ifast_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x4_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x2_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+EXTERN(void) jpeg_idct_float_3dnow
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_float_sse
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_float_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+EXTERN(void) jpeg_idct_islow_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_ifast_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x4_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x2_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+extern const int jconst_fdct_float[];
+extern const int jconst_fdct_islow_mmx[];
+extern const int jconst_fdct_ifast_mmx[];
+extern const int jconst_fdct_float_3dnow[];
+extern const int jconst_fdct_islow_sse2[];
+extern const int jconst_fdct_ifast_sse2[];
+extern const int jconst_fdct_float_sse[];
+extern const int jconst_idct_float[];
+extern const int jconst_idct_islow_mmx[];
+extern const int jconst_idct_ifast_mmx[];
+extern const int jconst_idct_float_3dnow[];
+extern const int jconst_idct_red_mmx[];
+extern const int jconst_idct_islow_sse2[];
+extern const int jconst_idct_ifast_sse2[];
+extern const int jconst_idct_float_sse[];
+extern const int jconst_idct_float_sse2[];
+extern const int jconst_idct_red_sse2[];
+

 /*
 * Macros for handling fixed-point arithmetic; these are used by many
--- a/jdct.inc
+++ b/jdct.inc
@@ -0,0 +1,125 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; Last Modified : January 5, 2006
+;
+; [TAB8]
+
+; ---- jdct.h --------------------------------------------------------------
+;
+; configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+; valid setting on this SIMD extension.
+;
+%if BITS_IN_JSAMPLE != 8
+%error "Sorry, this SIMD code only copes with 8-bit sample values."
+%endif
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM			word		; short
+%define SIZEOF_DCTELEM		SIZEOF_WORD	; sizeof(DCTELEM)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define MULTIPLIER		word		; short
+%define SIZEOF_MULTIPLIER	SIZEOF_WORD	; sizeof(MULTIPLIER)
+%define FAST_FLOAT		FP32		; float
+%define SIZEOF_FAST_FLOAT	SIZEOF_FP32	; sizeof(FAST_FLOAT)
+
+; Each IDCT routine has its own ideas about the best dct_table element type.
+;
+%define ISLOW_MULT_TYPE 	MULTIPLIER          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE	SIZEOF_MULTIPLIER   ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE 	MULTIPLIER          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE	SIZEOF_MULTIPLIER   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS	2	; fractional bits in scale factors
+%define FLOAT_MULT_TYPE 	FAST_FLOAT          ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE	SIZEOF_FAST_FLOAT   ; sizeof(FLOAT_MULT_TYPE)
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_fdct_islow		jFDislow	; jfdctint.asm
+%define jpeg_fdct_ifast		jFDifast	; jfdctfst.asm
+%define jpeg_fdct_float		jFDfloat	; jfdctflt.asm
+%define jpeg_fdct_islow_mmx	jFDMislow	; jfmmxint.asm
+%define jpeg_fdct_ifast_mmx	jFDMifast	; jfmmxfst.asm
+%define jpeg_fdct_float_3dnow	jFD3float	; jf3dnflt.asm
+%define jpeg_fdct_islow_sse2	jFDSislow	; jfss2int.asm
+%define jpeg_fdct_ifast_sse2	jFDSifast	; jfss2fst.asm
+%define jpeg_fdct_float_sse	jFDSfloat	; jfsseflt.asm
+%define jpeg_convsamp_int	jCnvInt		; jcqntint.asm
+%define jpeg_quantize_int	jQntInt		; jcqntint.asm
+%define jpeg_quantize_idiv	jQntIDiv	; jcqntint.asm
+%define jpeg_convsamp_float	jCnvFloat	; jcqntflt.asm
+%define jpeg_quantize_float	jQntFloat	; jcqntflt.asm
+%define jpeg_convsamp_int_mmx	jCnvMmx		; jcqntmmx.asm
+%define jpeg_quantize_int_mmx	jQntMmx		; jcqntmmx.asm
+%define jpeg_convsamp_flt_3dnow	jCnv3dnow	; jcqnt3dn.asm
+%define jpeg_quantize_flt_3dnow	jQnt3dnow	; jcqnt3dn.asm
+%define jpeg_convsamp_int_sse2	jCnvISse2	; jcqnts2i.asm
+%define jpeg_quantize_int_sse2	jQntISse2	; jcqnts2i.asm
+%define jpeg_convsamp_flt_sse	jCnvSse		; jcqntsse.asm
+%define jpeg_quantize_flt_sse	jQntSse		; jcqntsse.asm
+%define jpeg_convsamp_flt_sse2	jCnvFSse2	; jcqnts2f.asm
+%define jpeg_quantize_flt_sse2	jQntFSse2	; jcqnts2f.asm
+%define jpeg_idct_islow		jRDislow	; jidctint.asm
+%define jpeg_idct_ifast		jRDifast	; jidctfst.asm
+%define jpeg_idct_float		jRDfloat	; jidctflt.asm
+%define jpeg_idct_4x4		jRD4x4		; jidctred.asm
+%define jpeg_idct_2x2		jRD2x2		; jidctred.asm
+%define jpeg_idct_1x1		jRD1x1		; jidctred.asm
+%define jpeg_idct_islow_mmx	jRDMislow	; jimmxint.asm
+%define jpeg_idct_ifast_mmx	jRDMifast	; jimmxfst.asm
+%define jpeg_idct_float_3dnow	jRD3float	; ji3dnflt.asm
+%define jpeg_idct_4x4_mmx	jRDM4x4		; jimmxred.asm
+%define jpeg_idct_2x2_mmx	jRDM2x2		; jimmxred.asm
+%define jpeg_idct_islow_sse2	jRDSislow	; jiss2int.asm
+%define jpeg_idct_ifast_sse2	jRDSifast	; jiss2fst.asm
+%define jpeg_idct_float_sse	jRDSfloat	; jisseflt.asm
+%define jpeg_idct_float_sse2	jRD2float	; jiss2flt.asm
+%define jpeg_idct_4x4_sse2	jRDS4x4		; jiss2red.asm
+%define jpeg_idct_2x2_sse2	jRDS2x2		; jiss2red.asm
+%define jconst_fdct_float	jFCfloat	; jfdctflt.asm
+%define jconst_fdct_islow_mmx	jFCMislow	; jfmmxint.asm
+%define jconst_fdct_ifast_mmx	jFCMifast	; jfmmxfst.asm
+%define jconst_fdct_float_3dnow	jFC3float	; jf3dnflt.asm
+%define jconst_fdct_islow_sse2	jFCSislow	; jfss2int.asm
+%define jconst_fdct_ifast_sse2	jFCSifast	; jfss2fst.asm
+%define jconst_fdct_float_sse	jFCSfloat	; jfsseflt.asm
+%define jconst_idct_float	jRCfloat	; jidctflt.asm
+%define jconst_idct_islow_mmx	jRCMislow	; jimmxint.asm
+%define jconst_idct_ifast_mmx	jRCMifast	; jimmxfst.asm
+%define jconst_idct_float_3dnow	jRC3float	; ji3dnflt.asm
+%define jconst_idct_red_mmx	jRCMred		; jimmxred.asm
+%define jconst_idct_islow_sse2	jRCSislow	; jiss2int.asm
+%define jconst_idct_ifast_sse2	jRCSifast	; jiss2fst.asm
+%define jconst_idct_float_sse	jRCSfloat	; jisseflt.asm
+%define jconst_idct_float_sse2	jRC2float	; jiss2flt.asm
+%define jconst_idct_red_sse2	jRCSred		; jiss2red.asm
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; --------------------------------------------------------------------------
+
+%define ROW(n,b,s)		((b)+(n)*(s))
+%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
+
+%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+
+; --------------------------------------------------------------------------
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : December 24, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains the inverse-DCT management logic.
 * This code selects a particular IDCT implementation to be used,
 * and it performs related housekeeping chores.  No code in this file
@@ -94,6 +101,7 @@ start_pass (j_decompress_ptr cinfo)
  int method = 0;
  inverse_DCT_method_ptr method_ptr = NULL;
  JQUANT_TBL * qtbl;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
@@ -105,34 +113,95 @@ start_pass (j_decompress_ptr cinfo)
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
    case 2:
-      method_ptr = jpeg_idct_2x2;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+	method_ptr = jpeg_idct_2x2_sse2;
+      else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	method_ptr = jpeg_idct_2x2_mmx;
+      else
+#endif
+	method_ptr = jpeg_idct_2x2;
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
    case 4:
-      method_ptr = jpeg_idct_4x4;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+	method_ptr = jpeg_idct_4x4_sse2;
+      else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	method_ptr = jpeg_idct_4x4_mmx;
+      else
+#endif
+	method_ptr = jpeg_idct_4x4;
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
-#endif
+#endif /* IDCT_SCALING_SUPPORTED */
    case DCTSIZE:
      switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
      case JDCT_ISLOW:
-	method_ptr = jpeg_idct_islow;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_islow_sse2))
+	  method_ptr = jpeg_idct_islow_sse2;
+	else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  method_ptr = jpeg_idct_islow_mmx;
+	else
+#endif
+	  method_ptr = jpeg_idct_islow;
 	method = JDCT_ISLOW;
 	break;
-#endif
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
      case JDCT_IFAST:
-	method_ptr = jpeg_idct_ifast;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_ifast_sse2))
+	  method_ptr = jpeg_idct_ifast_sse2;
+	else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  method_ptr = jpeg_idct_ifast_mmx;
+	else
+#endif
+	  method_ptr = jpeg_idct_ifast;
 	method = JDCT_IFAST;
 	break;
-#endif
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
      case JDCT_FLOAT:
-	method_ptr = jpeg_idct_float;
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_float_sse2))
+	  method_ptr = jpeg_idct_float_sse2;
+	else
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+	if (simd & JSIMD_SSE &&
+	    IS_CONST_ALIGNED_16(jconst_idct_float_sse))
+	  method_ptr = jpeg_idct_float_sse;
+	else
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+	if (simd & JSIMD_3DNOW)
+	  method_ptr = jpeg_idct_float_3dnow;
+	else
+#endif
+	  method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
-#endif
+#endif /* DCT_FLOAT_SUPPORTED */
      default:
 	ERREXIT(cinfo, JERR_NOT_COMPILED);
 	break;
@@ -267,3 +336,78 @@ jinit_inverse_dct (j_decompress_ptr cinfo)
    idct->cur_method[ci] = -1;
  }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_inverse_dct (j_decompress_ptr cinfo, int method)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+  switch (method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_islow_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_ISLOW_SUPPORTED */
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_ifast_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_IFAST_SUPPORTED */
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_float_sse2))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_SSE2); */
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_idct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_MMX); */
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW)
+      return JSIMD_3DNOW;	/* (JSIMD_3DNOW | JSIMD_MMX); */
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_FLOAT_SUPPORTED */
+#ifdef IDCT_SCALING_SUPPORTED
+  case JDCT_FLOAT + 1:
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* IDCT_SCALING_SUPPORTED */
+  default:
+    ;
+  }
+
+  return JSIMD_NONE;	/* not compiled */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
 * This file contains Huffman entropy decoding routines.
 *
 * Much of the complexity here has to do with supporting input suspension.
@@ -151,8 +158,8 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 {
  JHUFF_TBL *htbl;
  d_derived_tbl *dtbl;
-  int p, i, l, si, numsymbols;
-  int lookbits, ctr;
+  int p, i, l, la, lx, si, numsymbols;
+  int lookbits, look_end, sym, val, ctr;
  char huffsize[257];
  unsigned int huffcode[257];
  unsigned int code;
@@ -234,18 +241,34 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
   * with that code.
   */

-  MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
+  MEMZERO(dtbl->lookx_nbits, SIZEOF(dtbl->lookx_nbits));

  p = 0;
-  for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
+  for (l = 1; l <= HUFFX_LOOKAHEAD-1; l++) {
    for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
      /* l = current code's length, p = its index in huffcode[] & huffval[]. */
      /* Generate left-justified code followed by all possible bit sequences */
-      lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
-      for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
-	dtbl->look_nbits[lookbits] = l;
-	dtbl->look_sym[lookbits] = htbl->huffval[p];
-	lookbits++;
+      sym = htbl->huffval[p];		/* current symbol */
+      la = sym & 15;			/* length of additional bits field */
+      lx = HUFFX_LOOKAHEAD - l;
+      lookbits = huffcode[p] << lx;
+      look_end = lookbits + (1 << lx);
+      lx -= la;
+      while (lookbits < look_end) {
+	if (lx >= 0) {
+	  val = (lookbits >>  lx) & ((1 << la) - 1);
+	  ctr = 1 << lx;
+	} else {
+	  val = (lookbits << -lx) & ((1 << la) - 1);
+	  ctr = 1;
+	}
+	val = HUFF_EXTEND(val, la);
+	for (; ctr > 0; ctr--) {
+	  dtbl->lookx_nbits[lookbits] = l + la;
+	  dtbl->lookx_val[lookbits] = val;
+	  dtbl->lookx_sym[lookbits] = sym;
+	  lookbits++;
+	}
      }
    }
  }
@@ -271,23 +294,8 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 * See jdhuff.h for info about usage.
 * Note: current values of get_buffer and bits_left are passed as parameters,
 * but are returned in the corresponding fields of the state struct.
- *
- * On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
- * of get_buffer to be used.  (On machines with wider words, an even larger
- * buffer could be used.)  However, on some machines 32-bit shifts are
- * quite slow and take time proportional to the number of places shifted.
- * (This is true with most PC compilers, for instance.)  In this case it may
- * be a win to set MIN_GET_BITS to the minimum value of 15.  This reduces the
- * average shift distance at the cost of more calls to jpeg_fill_bit_buffer.
 */

-#ifdef SLOW_SHIFT_32
-#define MIN_GET_BITS  15	/* minimum allowable value */
-#else
-#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
-#endif
-
-
 GLOBAL(boolean)
 jpeg_fill_bit_buffer (bitread_working_state * state,
 		      register bit_buf_type get_buffer, register int bits_left,
@@ -433,32 +441,6 @@ jpeg_huff_decode (bitread_working_state * state,
 }


-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
 /*
 * Check for a restart marker & resynchronize decoder.
 * Returns FALSE if must suspend.
@@ -548,13 +530,59 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
      /* Decode a single block's worth of coefficients */

      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
+      {		/* HUFFX_DECODE */
+	register int nb, look, t;
+	if (bits_left < HUFFX_LOOKAHEAD) {
+	  register const JOCTET * next_input_byte = br_state.next_input_byte;
+	  register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	  if (cinfo->unread_marker == 0) {
+	    while (bits_left < MIN_GET_BITS) {
+	      register int c;
+	      if (bytes_in_buffer == 0 ||
+		  (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		goto label11; }
+	      bytes_in_buffer--; next_input_byte++;
+	      get_buffer = (get_buffer << 8) | c;
+	      bits_left += 8;
+	    }
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	  } else {
+	label11:
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	    if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+	      return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      nb = 1; goto label1;
+	    }
+	  }
+	}
+	look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	if ((nb = dctbl->lookx_nbits[look]) != 0) {
+	  s = dctbl->lookx_val[look];
+	  if (nb <= HUFFX_LOOKAHEAD) {
+	    DROP_BITS(nb);
+	  } else {
+	    DROP_BITS(HUFFX_LOOKAHEAD);
+	    nb -= HUFFX_LOOKAHEAD;
+	    CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	    s += GET_BITS(nb);
+	  }
+	} else {
+	  nb = HUFFX_LOOKAHEAD;
+      label1:
+	  if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,dctbl,nb))
+	       < 0) { return FALSE; }
+	  get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	  if (s) {
+	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	    t = GET_BITS(s);
+	    s = HUFF_EXTEND(t, s);
+	  }
+	}
      }
-
      if (entropy->dc_needed[blkn]) {
 	/* Convert DC difference to actual value, update last_dc_val */
 	int ci = cinfo->MCU_membership[blkn];
@@ -569,16 +597,65 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* Since zeroes are skipped, output area must be cleared beforehand */
 	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
+	  {	/* HUFFX_DECODE */
+	    register int nb, look, t;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      register const JOCTET * next_input_byte
+					      = br_state.next_input_byte;
+	      register size_t bytes_in_buffer = br_state.bytes_in_buffer;
+	      if (cinfo->unread_marker == 0) {
+		while (bits_left < MIN_GET_BITS) {
+		  register int c;
+		  if (bytes_in_buffer == 0 ||
+		      (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		    goto label21; }
+		  bytes_in_buffer--; next_input_byte++;
+		  get_buffer = (get_buffer << 8) | c;
+		  bits_left += 8;
+		}
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+	      } else {
+	    label21:
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+		if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left,0)) {
+		  return FALSE; }
+		get_buffer = br_state.get_buffer;
+		bits_left  = br_state.bits_left;
+		if (bits_left < HUFFX_LOOKAHEAD) {
+		  nb = 1; goto label2;
+		}
+	      }
+	    }
+	    look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	    if ((nb = actbl->lookx_nbits[look]) != 0) {
+	      s = actbl->lookx_val[look];
+	      r = actbl->lookx_sym[look] >> 4;
+	      if (nb <= HUFFX_LOOKAHEAD) {
+		DROP_BITS(nb);
+	      } else {
+		DROP_BITS(HUFFX_LOOKAHEAD);
+		nb -= HUFFX_LOOKAHEAD;
+		CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+		s += GET_BITS(nb);
+	      }
+	    } else {
+	      nb = HUFFX_LOOKAHEAD;
+	  label2:
+	      if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,actbl,nb))
+		   < 0) { return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      r = s >> 4; s &= 15;
+	      if (s) {
+		CHECK_BIT_BUFFER(br_state, s, return FALSE);
+		t = GET_BITS(s);
+		s = HUFF_EXTEND(t, s);
+	      }
+	    }
+	  }
 	  if (s) {
 	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    r = GET_BITS(s);
-	    s = HUFF_EXTEND(r, s);
 	    /* Output coefficient in natural (dezigzagged) order.
 	     * Note: the extra entries in jpeg_natural_order[] will save us
 	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
@@ -596,15 +673,64 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* In this path we just discard the values */
 	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
+	  {	/* HUFFX_DECODE */
+	    register int nb, look;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      register const JOCTET * next_input_byte
+					      = br_state.next_input_byte;
+	      register size_t bytes_in_buffer = br_state.bytes_in_buffer;
+	      if (cinfo->unread_marker == 0) {
+		while (bits_left < MIN_GET_BITS) {
+		  register int c;
+		  if (bytes_in_buffer == 0 ||
+		      (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		    goto label31; }
+		  bytes_in_buffer--; next_input_byte++;
+		  get_buffer = (get_buffer << 8) | c;
+		  bits_left += 8;
+		}
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+	      } else {
+	    label31:
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+		if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left,0)) {
+		  return FALSE; }
+		get_buffer = br_state.get_buffer;
+		bits_left  = br_state.bits_left;
+		if (bits_left < HUFFX_LOOKAHEAD) {
+		  nb = 1; goto label3;
+		}
+	      }
+	    }
+	    look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	    if ((nb = actbl->lookx_nbits[look]) != 0) {
+	      s = actbl->lookx_sym[look];
+	      r = s >> 4; s &= 15;
+	      if (nb <= HUFFX_LOOKAHEAD) {
+		DROP_BITS(nb);
+	      } else {
+		DROP_BITS(HUFFX_LOOKAHEAD);
+		nb -= HUFFX_LOOKAHEAD;
+		CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+		DROP_BITS(nb);
+	      }
+	    } else {
+	      nb = HUFFX_LOOKAHEAD;
+	  label3:
+	      if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,actbl,nb))
+		   < 0) { return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      r = s >> 4; s &= 15;
+	      if (s) {
+		CHECK_BIT_BUFFER(br_state, s, return FALSE);
+		DROP_BITS(s);
+	      }
+	    }
+	  }
 	  if (s) {
 	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
 	  } else {
 	    if (r != 15)
 	      break;
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
 * This file contains declarations for Huffman entropy decoding routines
 * that are shared between the sequential decoder (jdhuff.c) and the
 * progressive decoder (jdphuff.c).  No other modules need to see these.
@@ -21,7 +28,7 @@

 /* Derived data constructed for each Huffman table */

-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+#define HUFFX_LOOKAHEAD	9	/* # of bits of lookahead */

 typedef struct {
  /* Basic tables: (element [0] of each array is unused) */
@@ -36,13 +43,15 @@ typedef struct {
  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
  JHUFF_TBL *pub;

-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+  /* Lookahead tables: indexed by the next HUFFX_LOOKAHEAD bits of
   * the input data stream.  If the next Huffman code is no more
-   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
+   * than HUFFX_LOOKAHEAD-1 bits long, we can obtain its length,
+   * the corresponding symbol, and the encoded coefficient value
+   * directly from these tables.
   */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+  UINT8 lookx_nbits[1<<HUFFX_LOOKAHEAD];  /* # bits, or 0 if too long */
+  INT16 lookx_val[1<<HUFFX_LOOKAHEAD];  /* coefficient value, or unused */
+  UINT8 lookx_sym[1<<HUFFX_LOOKAHEAD];  /* symbol, or unused */
 } d_derived_tbl;

 /* Expand a Huffman table definition into the derived format */
@@ -79,6 +88,21 @@ typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
 * because not all machines measure sizeof in 8-bit bytes.
 */

+#ifdef SLOW_SHIFT_32
+#define MIN_GET_BITS  15	/* minimum allowable value */
+#else
+#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
+#endif
+
+/* On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
+ * of get_buffer to be used.  (On machines with wider words, an even larger
+ * buffer could be used.)  However, on some machines 32-bit shifts are
+ * quite slow and take time proportional to the number of places shifted.
+ * (This is true with most PC compilers, for instance.)  In this case it may
+ * be a win to set MIN_GET_BITS to the minimum value of 15.  This reduces the
+ * average shift distance at the cost of more calls to jpeg_fill_bit_buffer.
+ */
+
 typedef struct {		/* Bitreading state saved across MCUs */
  bit_buf_type get_buffer;	/* current bit-extraction buffer */
  int bits_left;		/* # of unused bits in it */
@@ -109,7 +133,7 @@ typedef struct {		/* Bitreading working state within an MCU */
 	br_state.next_input_byte = cinfop->src->next_input_byte; \
 	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
 	get_buffer = permstate.get_buffer; \
-	bits_left = permstate.bits_left;
+	bits_left = permstate.bits_left

 #define BITREAD_SAVE_STATE(cinfop,permstate)  \
 	cinfop->src->next_input_byte = br_state.next_input_byte; \
@@ -155,47 +179,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
 	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
 	     register int bits_left, int nbits));

-
-/*
- * Code for extracting next Huffman-coded symbol from input bit stream.
- * Again, this is time-critical and we make the main paths be macros.
- *
- * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
- * without looping.  Usually, more than 95% of the Huffman codes will be 8
- * or fewer bits long.  The few overlength codes are handled with a loop,
- * which need not be inline code.
- *
- * Notes about the HUFF_DECODE macro:
- * 1. Near the end of the data segment, we may fail to get enough bits
- *    for a lookahead.  In that case, we do it the hard way.
- * 2. If the lookahead table contains no entry, the next code must be
- *    more than HUFF_LOOKAHEAD bits long.
- * 3. jpeg_huff_decode returns -1 if forced to suspend.
- */
-
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
-  if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-    if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
-    } \
-  } \
-  look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
-    DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
-  } else { \
-    nb = HUFF_LOOKAHEAD+1; \
-slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-	{ failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-  } \
-}
-
 /* Out-of-line case for Huffman code fetching */
 EXTERN(int) jpeg_huff_decode
 	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
 	     register int bits_left, d_derived_tbl * htbl, int min_bits));
+
+
+/*
+ * Figure F.12: extend sign bit.
+ */
+
+#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file contains code for merged upsampling/color conversion.
 *
 * This file combines functions from jdsample.c and jdcolor.c;
@@ -35,6 +42,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */

 #ifdef UPSAMPLE_MERGING_SUPPORTED

@@ -218,6 +226,17 @@ merged_1v_upsample (j_decompress_ptr cinfo,
 */


+#if RGB_PIXELSIZE == 4
+/* offset of filler byte */
+#define RGB_FILLER  (6 - (RGB_RED) - (RGB_GREEN) - (RGB_BLUE))
+/* byte pattern to fill with */
+#ifdef RGBX_FILLER_0XFF
+#define RGB_FILLER_BYTE 0xFF
+#else
+#define RGB_FILLER_BYTE 0x00
+#endif
+#endif /* RGB_PIXELSIZE == 4 */
+
 /*
 * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
 */
@@ -258,11 +277,17 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
    outptr[RGB_RED] =   range_limit[y + cred];
    outptr[RGB_GREEN] = range_limit[y + cgreen];
    outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr0++);
    outptr[RGB_RED] =   range_limit[y + cred];
    outptr[RGB_GREEN] = range_limit[y + cgreen];
    outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr += RGB_PIXELSIZE;
  }
  /* If image width is odd, do the last output column separately */
@@ -276,6 +301,9 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
    outptr[RGB_RED] =   range_limit[y + cred];
    outptr[RGB_GREEN] = range_limit[y + cgreen];
    outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
  }
 }

@@ -322,21 +350,33 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
    outptr0[RGB_RED] =   range_limit[y + cred];
    outptr0[RGB_GREEN] = range_limit[y + cgreen];
    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr0 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr00++);
    outptr0[RGB_RED] =   range_limit[y + cred];
    outptr0[RGB_GREEN] = range_limit[y + cgreen];
    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr0 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr01++);
    outptr1[RGB_RED] =   range_limit[y + cred];
    outptr1[RGB_GREEN] = range_limit[y + cgreen];
    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr1 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr01++);
    outptr1[RGB_RED] =   range_limit[y + cred];
    outptr1[RGB_GREEN] = range_limit[y + cgreen];
    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    outptr1 += RGB_PIXELSIZE;
  }
  /* If image width is odd, do the last output column separately */
@@ -350,10 +390,16 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
    outptr0[RGB_RED] =   range_limit[y + cred];
    outptr0[RGB_GREEN] = range_limit[y + cgreen];
    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
    y  = GETJSAMPLE(*inptr01);
    outptr1[RGB_RED] =   range_limit[y + cred];
    outptr1[RGB_GREEN] = range_limit[y + cgreen];
    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
  }
 }

@@ -370,6 +416,7 @@ GLOBAL(void)
 jinit_merged_upsampler (j_decompress_ptr cinfo)
 {
  my_upsample_ptr upsample;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  upsample = (my_upsample_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -382,19 +429,73 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)

  if (cinfo->max_v_samp_factor == 2) {
    upsample->pub.upsample = merged_2v_upsample;
-    upsample->upmethod = h2v2_merged_upsample;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2)) {
+      upsample->upmethod = jpeg_h2v2_merged_upsample_sse2;
+    } else
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      upsample->upmethod = jpeg_h2v2_merged_upsample_mmx;
+    } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+    {
+      upsample->upmethod = h2v2_merged_upsample;
+      build_ycc_rgb_table(cinfo);
+    }
    /* Allocate a spare row buffer */
    upsample->spare_row = (JSAMPROW)
      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
  } else {
    upsample->pub.upsample = merged_1v_upsample;
-    upsample->upmethod = h2v1_merged_upsample;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2)) {
+      upsample->upmethod = jpeg_h2v1_merged_upsample_sse2;
+    } else
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      upsample->upmethod = jpeg_h2v1_merged_upsample_mmx;
+    } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+    {
+      upsample->upmethod = h2v1_merged_upsample;
+      build_ycc_rgb_table(cinfo);
+    }
    /* No spare row needed */
    upsample->spare_row = NULL;
  }
-
-  build_ycc_rgb_table(cinfo);
 }

+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_merged_upsampler (j_decompress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2 &&
+      IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2))
+    return JSIMD_SSE2;
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 #endif /* UPSAMPLE_MERGING_SUPPORTED */
--- a/jdmermmx.asm
+++ b/jdmermmx.asm
@@ -0,0 +1,981 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef UPSAMPLE_MERGING_SUPPORTED
+%ifdef JDMERGE_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v1_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		3
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_merged_upsample_mmx)
+
+EXTN(jpeg_h2v1_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3
+	paddw     mm4,mm3
+	paddw     mm7,mm3
+	paddw     mm0,mm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5
+	paddw	mm4,mm2
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5
+	movq      mm7,mm2
+	punpcklwd mm5,mm1
+	punpckhwd mm6,mm1
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3
+	punpckhwd mm7,mm3
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+	alignx	16,7
+
+.Yloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
+
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifndef USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_mmx)
+
+EXTN(jpeg_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, POINTER [cinfo(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; cinfo
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_mmx)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_mmx)
+
+	add	esp, byte 7*SIZEOF_DWORD
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%else  ; USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		10
+%define inptr1		wk(0)-SIZEOF_JSAMPROW	; JSAMPROW inptr1
+%define inptr2		inptr1-SIZEOF_JSAMPROW	; JSAMPROW inptr2
+%define gotptr		inptr2-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_mmx)
+
+EXTN(jpeg_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [inptr2]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, JSAMPROW [esi+(ecx*2+0)*SIZEOF_JSAMPROW]	; inptr00
+	mov	esi, JSAMPROW [esi+(ecx*2+1)*SIZEOF_JSAMPROW]	; inptr01
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+
+	pop	ecx		; col
+	push	eax		; inptr00
+	push	esi		; inptr01
+
+	mov	esi, JSAMPROW [edi+0*SIZEOF_JSAMPROW]		; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]		; outptr1
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq	mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq	mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	mov	JSAMPROW [inptr1], ebx	; inptr1
+	mov	JSAMPROW [inptr2], edx	; inptr2
+	pop	edx			; edx=inptr01
+	pop	ebx			; ebx=inptr00
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3
+	paddw     mm4,mm3
+	paddw     mm7,mm3
+	paddw     mm0,mm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5
+	paddw	mm4,mm2
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5
+	movq      mm7,mm2
+	punpcklwd mm5,mm1
+	punpckhwd mm6,mm1
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3
+	punpckhwd mm7,mm3
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	ah,2			; YHctr
+	jmp	short .YHloop_1st
+	alignx	16,7
+
+.YHloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.YHloop_1st:
+	movq	MMWORD [wk(3)], mm0	; wk(3)=(R-Y)(L/H)
+	movq	MMWORD [wk(4)], mm2	; wk(4)=(G-Y)(L/H)
+	movq	MMWORD [wk(5)], mm4	; wk(5)=(B-Y)(L/H)
+
+	movq	mm7, MMWORD [ebx]	; mm7=Y(01234567)
+
+	mov	al,2			; YVctr
+	jmp	short .YVloop_1st
+	alignx	16,7
+
+.YVloop_2nd:
+	movq	mm0, MMWORD [wk(3)]	; mm0=(R-Y)(L/H)
+	movq	mm2, MMWORD [wk(4)]	; mm2=(G-Y)(L/H)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(B-Y)(L/H)
+
+	movq	mm7, MMWORD [edx]	; mm7=Y(01234567)
+	alignx	16,7
+
+.YVloop_1st:
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break
+
+	movq	MMWORD [wk(6)], mmA
+	movq	MMWORD [wk(7)], mmE
+	movq	MMWORD [wk(8)], mmC
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movq	mmH, MMWORD [wk(6)]
+	movq	mmB, MMWORD [wk(7)]
+	movq	mmD, MMWORD [wk(8)]
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmB
+	movq	MMWORD [esi+2*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	near .endcolumn
+
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr0
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr1
+	add	ebx, byte SIZEOF_MMWORD			; inptr00
+	add	edx, byte SIZEOF_MMWORD			; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd
+
+	push	ebx			; inptr00
+	push	edx			; inptr01
+	mov	ebx, JSAMPROW [inptr1]	; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]	; edx=inptr2
+	add	ebx, byte SIZEOF_MMWORD	; inptr1
+	add	edx, byte SIZEOF_MMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmB
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmH,mmD
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	esi, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmH,mmB
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmH
+	movd	edx,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [esi+0*SIZEOF_DWORD], eax
+	mov	DWORD [edi+0*SIZEOF_DWORD], edx
+	psrlq	mmH,DWORD_BIT
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmH
+	movd	edx,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	esi, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [esi+0*SIZEOF_WORD], ax
+	mov	WORD [edi+0*SIZEOF_WORD], dx
+	shr	eax,WORD_BIT
+	shr	edx,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	esi, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [esi+0*SIZEOF_BYTE], al
+	mov	BYTE [edi+0*SIZEOF_BYTE], dl
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break
+
+	movq	MMWORD [wk(6)], mmA
+	movq	MMWORD [wk(7)], mmD
+	movq	MMWORD [wk(8)], mmC
+	movq	MMWORD [wk(9)], mmH
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movq	mmE, MMWORD [wk(6)]
+	movq	mmF, MMWORD [wk(7)]
+	movq	mmB, MMWORD [wk(8)]
+	movq	mmG, MMWORD [wk(9)]
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmF
+	movq	MMWORD [esi+2*SIZEOF_MMWORD], mmB
+	movq	MMWORD [esi+3*SIZEOF_MMWORD], mmG
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr0
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr1
+	add	ebx, byte SIZEOF_MMWORD			; inptr00
+	add	edx, byte SIZEOF_MMWORD			; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd
+
+	push	ebx			; inptr00
+	push	edx			; inptr01
+	mov	ebx, JSAMPROW [inptr1]	; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]	; edx=inptr2
+	add	ebx, byte SIZEOF_MMWORD	; inptr1
+	add	edx, byte SIZEOF_MMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmF
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmE,mmB
+	movq	mmF,mmG
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	esi, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmE,mmF
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	esi, byte 1*SIZEOF_MMWORD
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [esi+0*SIZEOF_DWORD], mmE
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; !USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+%endif ; JDMERGE_MMX_SUPPORTED
+%endif ; UPSAMPLE_MERGING_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
--- a/jdmerss2.asm
+++ b/jdmerss2.asm
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
 * This file contains Huffman entropy decoding routines for progressive JPEG.
 *
 * Much of the complexity here has to do with supporting input suspension.
@@ -69,6 +76,7 @@ typedef struct {
  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];

  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+  d_derived_tbl * dc_derived_tbls[MAX_COMPS_IN_SCAN];
 } phuff_entropy_decoder;

 typedef phuff_entropy_decoder * phuff_entropy_ptr;
@@ -168,6 +176,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
 	tbl = compptr->dc_tbl_no;
 	jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
 				& entropy->derived_tbls[tbl]);
+	entropy->dc_derived_tbls[ci] = entropy->derived_tbls[tbl];
      }
    } else {
      tbl = compptr->ac_tbl_no;
@@ -193,32 +202,6 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
 }


-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
 /*
 * Check for a restart marker & resynchronize decoder.
 * Returns FALSE if must suspend.
@@ -287,13 +270,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
  int Al = cinfo->Al;
-  register int s, r;
-  int blkn, ci;
-  JBLOCKROW block;
+  int blkn;
  BITREAD_STATE_VARS;
  savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;

  /* Process restart marker if needed; may have to suspend */
  if (cinfo->restart_interval) {
@@ -314,21 +293,67 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
    /* Outer loop handles each block in the MCU */

    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      block = MCU_data[blkn];
-      ci = cinfo->MCU_membership[blkn];
-      compptr = cinfo->cur_comp_info[ci];
-      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
+      JBLOCKROW block = MCU_data[blkn];
+      int ci = cinfo->MCU_membership[blkn];
+      d_derived_tbl * tbl = entropy->dc_derived_tbls[ci];
+      register int s;

      /* Decode a single block's worth of coefficients */

      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
+      {		/* HUFFX_DECODE */
+	register int nb, look, t;
+	if (bits_left < HUFFX_LOOKAHEAD) {
+	  register const JOCTET * next_input_byte = br_state.next_input_byte;
+	  register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	  if (cinfo->unread_marker == 0) {
+	    while (bits_left < MIN_GET_BITS) {
+	      register int c;
+	      if (bytes_in_buffer == 0 ||
+		  (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		goto label11; }
+	      bytes_in_buffer--; next_input_byte++;
+	      get_buffer = (get_buffer << 8) | c;
+	      bits_left += 8;
+	    }
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	  } else {
+	label11:
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	    if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+	      return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      nb = 1; goto label1;
+	    }
+	  }
+	}
+	look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	if ((nb = tbl->lookx_nbits[look]) != 0) {
+	  s = tbl->lookx_val[look];
+	  if (nb <= HUFFX_LOOKAHEAD) {
+	    DROP_BITS(nb);
+	  } else {
+	    DROP_BITS(HUFFX_LOOKAHEAD);
+	    nb -= HUFFX_LOOKAHEAD;
+	    CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	    s += GET_BITS(nb);
+	  }
+	} else {
+	  nb = HUFFX_LOOKAHEAD;
+      label1:
+	  if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+	       < 0) { return FALSE; }
+	  get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	  if (s) {
+	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	    t = GET_BITS(s);
+	    s = HUFF_EXTEND(t, s);
+	  }
+	}
      }
-
      /* Convert DC difference to actual value, update last_dc_val */
      s += state.last_dc_val[ci];
      state.last_dc_val[ci] = s;
@@ -359,11 +384,8 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
  int Se = cinfo->Se;
  int Al = cinfo->Al;
-  register int s, k, r;
  unsigned int EOBRUN;
-  JBLOCKROW block;
  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;

  /* Process restart marker if needed; may have to suspend */
  if (cinfo->restart_interval) {
@@ -384,22 +406,74 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)

    /* There is always only one block per MCU */

-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
+    if (EOBRUN > 0) {		/* if it's a band of zeroes... */
      EOBRUN--;			/* ...process it now (we do nothing) */
-    else {
+    } else {
+      JBLOCKROW block = MCU_data[0];
+      d_derived_tbl * tbl = entropy->ac_derived_tbl;
+      register int s, k, r;
+
+      /* Load up working state */
      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-      block = MCU_data[0];
-      tbl = entropy->ac_derived_tbl;

      for (k = cinfo->Ss; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-	r = s >> 4;
-	s &= 15;
+	{	/* HUFFX_DECODE */
+	  register int nb, look, t;
+	  if (bits_left < HUFFX_LOOKAHEAD) {
+	    register const JOCTET * next_input_byte = br_state.next_input_byte;
+	    register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	    if (cinfo->unread_marker == 0) {
+	      while (bits_left < MIN_GET_BITS) {
+		register int c;
+		if (bytes_in_buffer == 0 ||
+		    (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		  goto label21; }
+		bytes_in_buffer--; next_input_byte++;
+		get_buffer = (get_buffer << 8) | c;
+		bits_left += 8;
+	      }
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	    } else {
+	  label21:
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	      if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+		return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      if (bits_left < HUFFX_LOOKAHEAD) {
+		nb = 1; goto label2;
+	      }
+	    }
+	  }
+	  look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	  if ((nb = tbl->lookx_nbits[look]) != 0) {
+	    s = tbl->lookx_val[look];
+	    r = tbl->lookx_sym[look] >> 4;
+	    if (nb <= HUFFX_LOOKAHEAD) {
+	      DROP_BITS(nb);
+	    } else {
+	      DROP_BITS(HUFFX_LOOKAHEAD);
+	      nb -= HUFFX_LOOKAHEAD;
+	      CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	      s += GET_BITS(nb);
+	    }
+	  } else {
+	    nb = HUFFX_LOOKAHEAD;
+	label2:
+	    if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+		 < 0) { return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    r = s >> 4; s &= 15;
+	    if (s) {
+	      CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	      t = GET_BITS(s);
+	      s = HUFF_EXTEND(t, s);
+	    }
+	  }
+	}
 	if (s) {
 	  k += r;
-	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	  r = GET_BITS(s);
-	  s = HUFF_EXTEND(r, s);
 	  /* Scale and output coefficient in natural (dezigzagged) order */
 	  (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
 	} else {
@@ -444,7 +518,6 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
  int blkn;
-  JBLOCKROW block;
  BITREAD_STATE_VARS;

  /* Process restart marker if needed; may have to suspend */
@@ -464,7 +537,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  /* Outer loop handles each block in the MCU */

  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
+    JBLOCKROW block = MCU_data[blkn];

    /* Encoded data is simply the next bit of the two's-complement DC value */
    CHECK_BIT_BUFFER(br_state, 1, return FALSE);
@@ -492,14 +565,14 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
  int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+  int Al = cinfo->Al;
  register int s, k, r;
  unsigned int EOBRUN;
  JBLOCKROW block;
  JCOEFPTR thiscoef;
  BITREAD_STATE_VARS;
  d_derived_tbl * tbl;
+  int pm1[2];
  int num_newnz;
  int newnz_pos[DCTSIZE2];

@@ -522,6 +595,13 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
    block = MCU_data[0];
    tbl = entropy->ac_derived_tbl;

+    /* The pm1[] array is indexed by a value from relational operator.
+     * This method eliminates conditional branches depending on random data,
+     * which result in lower performance on recent processors.
+     */
+    pm1[0] =   1  << cinfo->Al;	/* +1 in the bit position being coded */
+    pm1[1] = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+
    /* If we are forced to suspend, we must undo the assignments to any newly
     * nonzero coefficients in the block, because otherwise we'd get confused
     * next time about which coefficients were already nonzero.
@@ -535,18 +615,63 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)

    if (EOBRUN == 0) {
      for (; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  if (s != 1)		/* size of new coef should always be 1 */
-	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1))
-	    s = p1;		/* newly nonzero coef is positive */
-	  else
-	    s = m1;		/* newly nonzero coef is negative */
-	} else {
+	{	/* HUFFX_DECODE */
+	  register int nb, look, t;
+	  if (bits_left < HUFFX_LOOKAHEAD) {
+	    register const JOCTET * next_input_byte = br_state.next_input_byte;
+	    register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	    if (cinfo->unread_marker == 0) {
+	      while (bits_left < MIN_GET_BITS) {
+		register int c;
+		if (bytes_in_buffer == 0 ||
+		    (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		  goto label31; }
+		bytes_in_buffer--; next_input_byte++;
+		get_buffer = (get_buffer << 8) | c;
+		bits_left += 8;
+	      }
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	    } else {
+	  label31:
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	      if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+		goto undoit; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      if (bits_left < HUFFX_LOOKAHEAD) {
+		nb = 1; goto label3;
+	      }
+	    }
+	  }
+	  look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	  if ((nb = tbl->lookx_nbits[look]) != 0) {
+	    t = tbl->lookx_sym[look];
+	    s = tbl->lookx_val[look];
+	    r = t >> 4; t &= 15;
+	    if (t <= 1) {
+	      DROP_BITS(nb);
+	    } else {		  /* size of new coef should always be 1 */
+	      WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	      DROP_BITS(nb - (t - 1));
+	      s = (s >= 0) ? 1 : -1;
+	    }
+	  } else {
+	    nb = HUFFX_LOOKAHEAD;
+	label3:
+	    if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+		 < 0) { goto undoit; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    r = s >> 4; s &= 15;
+	    if (s) {
+	      if (s != 1)	    /* size of new coef should always be 1 */
+		WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	      CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	      s = GET_BITS(1) ? 1 : -1;
+	    }
+	  }
+	}
+	if (s == 0) {
 	  if (r != 15) {
 	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
 	    if (r) {
@@ -567,12 +692,8 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 	  if (*thiscoef != 0) {
 	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
 	    if (GET_BITS(1)) {
-	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-		if (*thiscoef >= 0)
-		  *thiscoef += p1;
-		else
-		  *thiscoef += m1;
-	      }
+	      if ((*thiscoef & pm1[0]) == 0) /* do nothing if already set it */
+		*thiscoef += pm1[(*thiscoef < 0)];
 	    }
 	  } else {
 	    if (--r < 0)
@@ -583,7 +704,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 	if (s) {
 	  int pos = jpeg_natural_order[k];
 	  /* Output newly nonzero coefficient */
-	  (*block)[pos] = (JCOEF) s;
+	  (*block)[pos] = (JCOEF) (s << Al);
 	  /* Remember its position in case we have to suspend */
 	  newnz_pos[num_newnz++] = pos;
 	}
@@ -601,12 +722,8 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 	if (*thiscoef != 0) {
 	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
 	  if (GET_BITS(1)) {
-	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-	      if (*thiscoef >= 0)
-		*thiscoef += p1;
-	      else
-		*thiscoef += m1;
-	    }
+	    if ((*thiscoef & pm1[0]) == 0)  /* do nothing if already set it */
+	      *thiscoef += pm1[(*thiscoef < 0)];
 	  }
 	}
      }
--- a/jdsammmx.asm
+++ b/jdsammmx.asm
@@ -0,0 +1,893 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE		times 4 dw  1
+PW_TWO		times 4 dw  2
+PW_THREE	times 4 dw  3
+PW_SEVEN	times 4 dw  7
+PW_EIGHT	times 4 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jpeg_h2v1_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_fancy_upsample_mmx)
+
+EXTN(jpeg_h2v1_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	mm0,mm0			; mm0=(all 0's)
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	mm6,mm6
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2,mm1
+	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
+	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
+	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
+
+	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
+	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
+
+	movq	mm7,mm1
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
+	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
+	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
+	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
+	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
+
+	psllw	mm3,BYTE_BIT
+	psllw	mm6,BYTE_BIT
+	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h2v2_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_fancy_upsample_mmx)
+
+EXTN(jpeg_h2v2_fancy_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, POINTER [compptr(edx)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(edx)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
+	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
+	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+	pand	mm1,mm7			; mm1=( 0 - - -)
+	pand	mm2,mm7			; mm2=( 0 - - -)
+
+	movq	MMWORD [wk(0)], mm1
+	movq	MMWORD [wk(1)], mm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	mm1,mm1
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
+	movq	mm2,mm1
+
+	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
+	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
+	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
+	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+.upsample:
+	; -- process the upper row
+
+	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
+	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
+
+	movq	mm0,mm7
+	movq	mm4,mm3
+	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
+	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
+	movq	mm5,mm7
+	movq	mm6,mm3
+	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
+	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
+
+	por	mm0,mm4				; mm0=( 1 2 3 4)
+	por	mm5,mm6				; mm5=( 3 4 5 6)
+
+	movq	mm1,mm7
+	movq	mm2,mm3
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
+	movq	mm4,mm3
+	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
+
+	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
+	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
+
+	movq	MMWORD [wk(0)], mm4
+
+	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm7
+	paddw	mm5,mm3
+	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
+	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
+	paddw	mm0,mm7
+	paddw	mm2,mm3
+	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
+	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
+
+	psllw	mm0,BYTE_BIT
+	psllw	mm2,BYTE_BIT
+	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+	; -- process the lower row
+
+	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
+	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
+
+	movq	mm7,mm6
+	movq	mm3,mm4
+	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
+	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
+	movq	mm0,mm6
+	movq	mm2,mm4
+	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
+	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
+
+	por	mm7,mm3				; mm7=( 1 2 3 4)
+	por	mm0,mm2				; mm0=( 3 4 5 6)
+
+	movq	mm1,mm6
+	movq	mm5,mm4
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
+	movq	mm3,mm4
+	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
+
+	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
+	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
+
+	movq	MMWORD [wk(1)], mm3
+
+	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm6
+	paddw	mm0,mm4
+	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
+	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
+	paddw	mm7,mm6
+	paddw	mm5,mm4
+	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
+	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
+
+	psllw	mm7,BYTE_BIT
+	psllw	mm5,BYTE_BIT
+	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifdef UPSAMPLE_H1V2_SUPPORTED
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h1v2_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define gotptr		ebp-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h1v2_fancy_upsample_mmx)
+
+EXTN(jpeg_h1v2_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	pxor	mm0,mm0			; mm0=(all 0's)
+	alignx	16,7
+
+.columnloop:
+	movq	mm1, MMWORD [ebx]	; mm1=row[ 0]( 0 1 2 3 4 5 6 7)
+	movq	mm2, MMWORD [ecx]	; mm2=row[-1]( 0 1 2 3 4 5 6 7)
+	movq	mm3, MMWORD [esi]	; mm3=row[+1]( 0 1 2 3 4 5 6 7)
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=row[ 0]( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=row[ 0]( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=row[-1]( 0 1 2 3)
+	punpckhbw mm5,mm0		; mm5=row[-1]( 4 5 6 7)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=row[+1]( 0 1 2 3)
+	punpckhbw mm6,mm0		; mm6=row[+1]( 4 5 6 7)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=Out0L=( 0 1 2 3)
+	psrlw	mm5,2			; mm5=Out0H=( 4 5 6 7)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=Out1L=( 0 1 2 3)
+	psrlw	mm6,2			; mm6=Out1H=( 4 5 6 7)
+
+	packuswb  mm2,mm5		; mm2=Out0=( 0 1 2 3 4 5 6 7)
+	packuswb  mm3,mm6		; mm3=Out1=( 0 1 2 3 4 5 6 7)
+
+	movq	MMWORD [edx], mm2
+	movq	MMWORD [edi], mm3
+
+	poppic	ebx
+
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 1*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr1
+	sub	eax, byte SIZEOF_MMWORD
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	poppic	eax		; remove gotptr
+	pop	ebp
+	ret
+
+%endif ; UPSAMPLE_H1V2_SUPPORTED
+%endif ; JDSAMPLE_FANCY_MMX_SUPPORTED
+
+%ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+
+%ifndef JDSAMPLE_FANCY_MMX_SUPPORTED
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+%endif
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v1_upsample_mmx (j_decompress_ptr cinfo,
+;                         jpeg_component_info * compptr,
+;                         JSAMPARRAY input_data,
+;                         JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_upsample_mmx)
+
+EXTN(jpeg_h2v1_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	short .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v2_upsample_mmx (j_decompress_ptr cinfo,
+;                         jpeg_component_info * compptr,
+;                         JSAMPARRAY input_data,
+;                         JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_upsample_mmx)
+
+EXTN(jpeg_h2v2_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JDSAMPLE_SIMPLE_MMX_SUPPORTED
--- a/jdsample.c
+++ b/jdsample.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file contains upsampling routines.
 *
 * Upsampling input data is counted in "row groups".  A row group
@@ -21,6 +28,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */


 /* Pointer to routine to upsample a single component */
@@ -285,6 +293,37 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 }


+#ifdef UPSAMPLE_H1V2_SUPPORTED
+
+/*
+ * Fast processing for the common case of 1:1 horizontal and 2:1 vertical.
+ * It's still a box filter.
+ *
+ * SIMD Ext: This routine is for files that are rotated or transposed
+ *           by jpegtran.
+ */
+
+METHODDEF(void)
+h1v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  int inrow, outrow;
+
+  inrow = outrow = 0;
+  while (outrow < cinfo->max_v_samp_factor) {
+    jcopy_sample_rows(input_data, inrow, output_data, outrow,
+		      1, cinfo->output_width);
+    jcopy_sample_rows(input_data, inrow, output_data, outrow+1,
+		      1, cinfo->output_width);
+    inrow++;
+    outrow += 2;
+  }
+}
+
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
+
+
 /*
 * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
 *
@@ -391,6 +430,52 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 }


+#ifdef UPSAMPLE_H1V2_SUPPORTED
+
+/*
+ * Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+ * Again a triangle filter; see comments for h2v1 case, above.
+ *
+ * It is OK for us to reference the adjacent input rows because we demanded
+ * context from the main buffer controller (see initialization code).
+ *
+ * SIMD Ext: This routine is for files that are rotated or transposed
+ *           by jpegtran.
+ */
+
+METHODDEF(void)
+h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  register JSAMPROW inptr0, inptr1, outptr;
+  register int colsum;
+  register JDIMENSION colctr;
+  int inrow, outrow, v;
+
+  inrow = outrow = 0;
+  while (outrow < cinfo->max_v_samp_factor) {
+    for (v = 0; v < 2; v++) {
+      /* inptr0 points to nearest input row, inptr1 points to next nearest */
+      inptr0 = input_data[inrow];
+      if (v == 0)		/* next nearest is row above */
+	inptr1 = input_data[inrow-1];
+      else			/* next nearest is row below */
+	inptr1 = input_data[inrow+1];
+      outptr = output_data[outrow++];
+
+      for (colctr = compptr->downsampled_width; colctr > 0; colctr--) {
+	colsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+	*outptr++ = (JSAMPLE) ((colsum + v + 1) >> 2);
+      }
+    }
+    inrow++;
+  }
+}
+
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
+
+
 /*
 * Module initialization routine for upsampling.
 */
@@ -403,6 +488,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
  jpeg_component_info * compptr;
  boolean need_buffer, do_fancy;
  int h_in_group, v_in_group, h_out_group, v_out_group;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);

  upsample = (my_upsample_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -447,18 +533,83 @@ jinit_upsampler (j_decompress_ptr cinfo)
    } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
      /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      if (do_fancy && compptr->downsampled_width > 2) {
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h2v1_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v1_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v1_fancy_upsample;
+      } else {
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2)
+	  upsample->methods[ci] = jpeg_h2v1_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v1_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v1_upsample;
+      }
    } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
      /* Special cases for 2h2v upsampling */
      if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h2v2_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v2_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v2_fancy_upsample;
+	upsample->pub.need_context_rows = TRUE;
+      } else {
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2)
+	  upsample->methods[ci] = jpeg_h2v2_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v2_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v2_upsample;
+      }
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+    } else if (h_in_group == h_out_group &&
+	       v_in_group * 2 == v_out_group) {
+      /* Special cases for 1h2v upsampling */
+      if (do_fancy) {
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h1v2_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h1v2_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h1v2_fancy_upsample;
 	upsample->pub.need_context_rows = TRUE;
      } else
-	upsample->methods[ci] = h2v2_upsample;
+	upsample->methods[ci] = h1v2_upsample;
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
    } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
      /* Generic integral-factors upsampling method */
@@ -468,11 +619,52 @@ jinit_upsampler (j_decompress_ptr cinfo)
    } else
      ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
    if (need_buffer) {
+      enum { SIZEOF_XMMWORD = 16 };	/* from jsimdext.inc */
      upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) jround_up((long) cinfo->output_width,
-				(long) cinfo->max_h_samp_factor),
+	 (JDIMENSION) jround_up(jround_up((long) cinfo->output_width,
+					  (long) cinfo->max_h_samp_factor),
+				(long) (2 * SIZEOF_XMMWORD)),
 	 (JDIMENSION) cinfo->max_v_samp_factor);
    }
  }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_upsampler (j_decompress_ptr cinfo, int do_fancy)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  if (!do_fancy)
+    return jpeg_simd_merged_upsampler(cinfo);
+#endif
+
+  if (do_fancy) {
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+  } else {
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2)
+      return JSIMD_SSE2;
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+  }
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
--- a/jdsamss2.asm
+++ b/jdsamss2.asm
@@ -0,0 +1,883 @@
+;
+; jdsamss2.asm - upsampling (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE		times 8 dw  1
+PW_TWO		times 8 dw  2
+PW_THREE	times 8 dw  3
+PW_SEVEN	times 8 dw  7
+PW_EIGHT	times 8 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jpeg_h2v1_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_fancy_upsample_sse2)
+
+EXTN(jpeg_h2v1_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)
+	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+
+.upsample:
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT
+	psllw	xmm6,BYTE_BIT
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h2v2_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_fancy_upsample_sse2)
+
+EXTN(jpeg_h2v2_fancy_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, POINTER [compptr(edx)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(edx)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1
+	movdqa	XMMWORD [wk(1)], xmm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1
+	movdqa	XMMWORD [wk(3)], xmm2
+
+.upsample:
+	; -- process the upper row
+
+	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4
+
+	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row
+
+	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3
+
+	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifdef UPSAMPLE_H1V2_SUPPORTED
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h1v2_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define gotptr		ebp-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h1v2_fancy_upsample_sse2)
+
+EXTN(jpeg_h1v2_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm1, XMMWORD [ebx]	; xmm1=row[ 0]( 0  1  2 ... 13 14 15)
+	movdqa	xmm2, XMMWORD [ecx]	; xmm2=row[-1]( 0  1  2 ... 13 14 15)
+	movdqa	xmm3, XMMWORD [esi]	; xmm3=row[+1]( 0  1  2 ... 13 14 15)
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm0		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm0		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=Out0L=( 0  1  2  3  4  5  6  7)
+	psrlw	xmm5,2			; xmm5=Out0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=Out1L=( 0  1  2  3  4  5  6  7)
+	psrlw	xmm6,2			; xmm6=Out1H=( 8  9 10 11 12 13 14 15)
+
+	packuswb  xmm2,xmm5		; xmm2=Out0=( 0  1  2 ... 13 14 15)
+	packuswb  xmm3,xmm6		; xmm3=Out1=( 0  1  2 ... 13 14 15)
+
+	movdqa	XMMWORD [edx], xmm2
+	movdqa	XMMWORD [edi], xmm3
+
+	poppic	ebx
+
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 1*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr1
+	sub	eax, byte SIZEOF_XMMWORD
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	poppic	eax		; remove gotptr
+	pop	ebp
+	ret
+
+%endif ; UPSAMPLE_H1V2_SUPPORTED
+%endif ; JDSAMPLE_FANCY_SSE2_SUPPORTED
+
+%ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+
+%ifndef JDSAMPLE_FANCY_SSE2_SUPPORTED
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+%endif
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v1_upsample_sse2 (j_decompress_ptr cinfo,
+;                          jpeg_component_info * compptr,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_upsample_sse2)
+
+EXTN(jpeg_h2v1_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)
+	jz	short .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v2_upsample_sse2 (j_decompress_ptr cinfo,
+;                          jpeg_component_info * compptr,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_upsample_sse2)
+
+EXTN(jpeg_h2v2_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JDSAMPLE_SIMPLE_SSE2_SUPPORTED
--- a/jf3dnflt.asm
+++ b/jf3dnflt.asm
@@ -0,0 +1,327 @@
+;
+; jf3dnflt.asm - floating-point FDCT (3DNow!)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382	times 2 dd  0.382683432365089771728460
+PD_0_707	times 2 dd  0.707106781186547524400844
+PD_0_541	times 2 dd  0.541196100146196984399723
+PD_1_306	times 2 dd  1.306562964876376527856643
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float_3dnow (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_float_3dnow)
+
+EXTN(jpeg_fdct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 10)=data0
+	punpckhdq mm4,mm1		; mm4=(01 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(06 16)=data6
+	punpckhdq mm5,mm3		; mm5=(07 17)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(02 12)=data2
+	punpckhdq mm4,mm3		; mm4=(03 13)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(04 14)=data4
+	punpckhdq mm0,mm5		; mm0=(05 15)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 01)=data0
+	punpckhdq mm4,mm1		; mm4=(10 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(60 61)=data6
+	punpckhdq mm5,mm3		; mm5=(70 71)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(20 21)=data2
+	punpckhdq mm4,mm3		; mm4=(30 31)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(40 41)=data4
+	punpckhdq mm0,mm5		; mm0=(50 51)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+	femms		; empty MMX/3DNow! state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jfdctflt.asm
+++ b/jfdctflt.asm
@@ -0,0 +1,288 @@
+;
+; jfdctflt.asm - floating-point FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+%define ROTATOR_TYPE	FP32	; float
+
+	alignz	16
+	global	EXTN(jconst_fdct_float)
+
+EXTN(jconst_fdct_float):
+
+F_0_382	dd	0.382683432365089771728460	; cos(PI*3/8)
+F_0_707	dd	0.707106781186547524400844	; cos(PI*1/4)
+F_0_541	dd	0.541196100146196984399723	; cos(PI*1/8)-cos(PI*3/8)
+F_1_306	dd	1.306562964876376527856643	; cos(PI*1/8)+cos(PI*3/8)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float (FAST_FLOAT * data)
+;
+
+%define data(b)	(b)+8		; FAST_FLOAT * data
+
+	align	16
+	global	EXTN(jpeg_fdct_float)
+
+EXTN(jpeg_fdct_float):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.rowloop:
+	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
+
+	; -- Even part
+
+	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fadd	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
+
+	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
+	fsub	st0,st4
+	fxch	st0,st3
+	faddp	st4,st0
+	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st2
+	faddp	st1,st0
+
+	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+
+	fstp	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
+
+	; -- Odd part
+
+	fadd	st2,st0
+	fadd	st0,st1
+	fxch	st0,st3
+	fadd	st1,st0
+	fxch	st0,st3
+
+	fld	st2
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
+	fxch	st0,st1
+	fsub	st0,st2
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
+	fxch	st0,st2
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
+	fxch	st0,st2
+	fadd	st3,st0
+	faddp	st2,st0
+
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
+	fsubr	st0,st1
+	fxch	st0,st3
+	faddp	st1,st0
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+
+	fstp	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
+
+	add	edx, byte DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx				; advance pointer to next row
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.columnloop:
+	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
+
+	; -- Even part
+
+	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fadd	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
+
+	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
+	fsub	st0,st4
+	fxch	st0,st3
+	faddp	st4,st0
+	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st2
+	faddp	st1,st0
+
+	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st4
+
+	fstp	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
+
+	; -- Odd part
+
+	fadd	st2,st0
+	fadd	st0,st1
+	fxch	st0,st3
+	fadd	st1,st0
+	fxch	st0,st3
+
+	fld	st2
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
+	fxch	st0,st1
+	fsub	st0,st2
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
+	fxch	st0,st2
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
+	fxch	st0,st2
+	fadd	st3,st0
+	faddp	st2,st0
+
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
+	fsubr	st0,st1
+	fxch	st0,st3
+	faddp	st1,st0
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+
+	fstp	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
+
+	add	edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jfdctfst.asm
+++ b/jfdctfst.asm
@@ -0,0 +1,303 @@
+;
+; jfdctfst.asm - fast integer FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctfst.c; see the jfdctfst.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; We can gain a little more speed, with a further compromise in accuracy,
+; by omitting the addition in a descaling shift.  This yields an
+; incorrectly rounded result half the time...
+;
+%macro	descale 2
+%ifdef USE_ACCURATE_ROUNDING
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32
+%endif
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_ifast (DCTELEM * data)
+;
+
+%define data(b)	(b)+8		; DCTELEM * data
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast)
+
+EXTN(jpeg_fdct_ifast):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process rows.
+
+	mov	ecx, DCTSIZE
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	alignx	16,7
+.rowloop:
+	push	ecx		; ctr
+	push	edx		; dataptr
+
+	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	eax
+
+	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx
+
+	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	eax
+
+	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	mov	edx, POINTER [esp+16]	; dataptr
+
+	add	edi,esi
+	imul	edi,(F_0_707)	; edi=z1
+	descale	edi,CONST_BITS
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[esi+edi]	; ecx=data2
+	sub	esi,edi		; esi=data6
+	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], si
+
+	; -- Odd part
+
+	pop	eax	; eax=tmp4
+	pop	edx	; edx=tmp5
+	pop	ebx	; ebx=tmp6
+	pop	edi	; edi=tmp7
+
+	add	eax,edx		; eax=tmp10
+	add	edx,ebx		; edx=tmp11
+	add	ebx,edi		; ebx=tmp12, edi=tmp7
+
+	imul	edx,(F_0_707)	; edx=z3
+	descale	edx,CONST_BITS
+	lea	esi,[edi+edx]	; esi=z11
+	sub	edi,edx		; edi=z13
+
+	mov	ecx,eax		; ecx=tmp10
+	sub	eax,ebx
+	imul	eax,(F_0_382)	; eax=z5
+	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
+	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
+	descale	eax,CONST_BITS
+	descale	ecx,CONST_BITS
+	descale	ebx,CONST_BITS
+	add	ecx,eax		; ecx=z2
+	add	ebx,eax		; ebx=z4
+
+	pop	edx		; dataptr
+
+	lea	eax,[edi+ecx]	; eax=data5
+	sub	edi,ecx		; edi=data3
+	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], ax
+	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], di
+
+	lea	ecx,[esi+ebx]	; ecx=data1
+	sub	esi,ebx		; esi=data7
+	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], si
+
+	pop	ecx		; ctr
+
+	add	edx, byte DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx			; advance pointer to next row
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	ecx, DCTSIZE
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	alignx	16,7
+.columnloop:
+	push	ecx		; ctr
+	push	edx		; dataptr
+
+	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	eax
+
+	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx
+
+	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	eax
+
+	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	mov	edx, POINTER [esp+16]	; dataptr
+
+	add	edi,esi
+	imul	edi,(F_0_707)	; edi=z1
+	descale	edi,CONST_BITS
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[esi+edi]	; ecx=data2
+	sub	esi,edi		; esi=data6
+	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], si
+
+	; -- Odd part
+
+	pop	eax	; eax=tmp4
+	pop	edx	; edx=tmp5
+	pop	ebx	; ebx=tmp6
+	pop	edi	; edi=tmp7
+
+	add	eax,edx		; eax=tmp10
+	add	edx,ebx		; edx=tmp11
+	add	ebx,edi		; ebx=tmp12, edi=tmp7
+
+	imul	edx,(F_0_707)	; edx=z3
+	descale	edx,CONST_BITS
+	lea	esi,[edi+edx]	; esi=z11
+	sub	edi,edx		; edi=z13
+
+	mov	ecx,eax		; ecx=tmp10
+	sub	eax,ebx
+	imul	eax,(F_0_382)	; eax=z5
+	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
+	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
+	descale	eax,CONST_BITS
+	descale	ecx,CONST_BITS
+	descale	ebx,CONST_BITS
+	add	ecx,eax		; ecx=z2
+	add	ebx,eax		; ebx=z4
+
+	pop	edx		; dataptr
+
+	lea	eax,[edi+ecx]	; eax=data5
+	sub	edi,ecx		; edi=data3
+	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], ax
+	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], di
+
+	lea	ecx,[esi+ebx]	; ecx=data1
+	sub	esi,ebx		; esi=data7
+	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], si
+
+	pop	ecx		; ctr
+
+	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_IFAST_SUPPORTED
--- a/jfdctint.asm
+++ b/jfdctint.asm
@@ -0,0 +1,342 @@
+;
+; jfdctint.asm - accurate integer FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; Descale and correctly round a DWORD value that's scaled by N bits.
+;
+%macro	descale 2
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_islow (DCTELEM * data)
+;
+
+%define data(b)	(b)+8		; DCTELEM * data
+
+	align	16
+	global	EXTN(jpeg_fdct_islow)
+
+EXTN(jpeg_fdct_islow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.rowloop:
+	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	ecx		; ctr
+	push	eax
+
+	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx
+
+	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	edx		; dataptr
+	push	eax
+
+	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	edx, POINTER [esp+8]	; dataptr
+	sal	ebx, PASS1_BITS
+	sal	eax, PASS1_BITS
+	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[edi+esi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
+	add	esi,ecx		; esi=data2
+	add	edi,ecx		; edi=data6
+	descale	esi,(CONST_BITS-PASS1_BITS)
+	descale	edi,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], si
+	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], di
+
+	; -- Odd part
+
+	mov	eax, INT32 [esp]	; eax=tmp4
+	mov	ebx, INT32 [esp+4]	; ebx=tmp5
+	mov	ecx, INT32 [esp+12]	; ecx=tmp6
+	mov	esi, INT32 [esp+16]	; esi=tmp7
+
+	lea	edx,[eax+ecx]	; edx=z3
+	lea	edi,[ebx+esi]	; edi=z4
+	add	eax,esi		; eax=z1
+	add	ebx,ecx		; ebx=z2
+
+	lea	esi,[edx+edi]
+	imul	esi,(F_1_175)	; esi=z5
+
+	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	edx,esi		; edx=z3(=z3+z5)
+	add	edi,esi		; edi=z4(=z4+z5)
+
+	lea	ecx,[eax+edx]	; ecx=z1+z3
+	lea	esi,[ebx+edi]	; esi=z2+z4
+	add	eax,edi		; eax=z1+z4
+	add	ebx,edx		; ebx=z2+z3
+
+	pop	edx		; edx=tmp4
+	pop	edi		; edi=tmp5
+	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
+	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
+	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
+	add	esi,edi		; esi=data5(=tmp5+z2+z4)
+	pop	edx		; dataptr
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	esi,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], si
+
+	pop	edi		; edi=tmp6
+	pop	ecx		; ecx=tmp7
+	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
+	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
+	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
+	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
+	pop	ecx		; ctr
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], ax
+
+	add	edx, byte DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx			; advance pointer to next row
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.columnloop:
+	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	ecx		; ctr
+	push	eax
+
+	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx
+
+	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	edx		; dataptr
+	push	eax
+
+	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	edx, POINTER [esp+8]	; dataptr
+	descale	ebx, PASS1_BITS
+	descale	eax, PASS1_BITS
+	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[edi+esi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
+	add	esi,ecx		; esi=data2
+	add	edi,ecx		; edi=data6
+	descale	esi,(CONST_BITS+PASS1_BITS)
+	descale	edi,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], si
+	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], di
+
+	; -- Odd part
+
+	mov	eax, INT32 [esp]	; eax=tmp4
+	mov	ebx, INT32 [esp+4]	; ebx=tmp5
+	mov	ecx, INT32 [esp+12]	; ecx=tmp6
+	mov	esi, INT32 [esp+16]	; esi=tmp7
+
+	lea	edx,[eax+ecx]	; edx=z3
+	lea	edi,[ebx+esi]	; edi=z4
+	add	eax,esi		; eax=z1
+	add	ebx,ecx		; ebx=z2
+
+	lea	esi,[edx+edi]
+	imul	esi,(F_1_175)	; esi=z5
+
+	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	edx,esi		; edx=z3(=z3+z5)
+	add	edi,esi		; edi=z4(=z4+z5)
+
+	lea	ecx,[eax+edx]	; ecx=z1+z3
+	lea	esi,[ebx+edi]	; esi=z2+z4
+	add	eax,edi		; eax=z1+z4
+	add	ebx,edx		; ebx=z2+z3
+
+	pop	edx		; edx=tmp4
+	pop	edi		; edi=tmp5
+	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
+	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
+	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
+	add	esi,edi		; esi=data5(=tmp5+z2+z4)
+	pop	edx		; dataptr
+	descale	ecx,(CONST_BITS+PASS1_BITS)
+	descale	esi,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], si
+
+	pop	edi		; edi=tmp6
+	pop	ecx		; ecx=tmp7
+	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
+	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
+	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
+	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
+	pop	ecx		; ctr
+	descale	ebx,(CONST_BITS+PASS1_BITS)
+	descale	eax,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], ax
+
+	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jfmmxfst.asm
+++ b/jfmmxfst.asm
@@ -0,0 +1,404 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_ifast_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast_mmx)
+
+EXTN(jpeg_fdct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_MMX_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
--- a/jfmmxint.asm
+++ b/jfmmxint.asm
@@ -0,0 +1,629 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_islow_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_islow_mmx)
+
+EXTN(jpeg_fdct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	psllw	mm5,PASS1_BITS		; mm5=data0
+	psllw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm1,DESCALE_P1
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm2,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm3,DESCALE_P1
+	psrad	mm5,DESCALE_P1
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	mm5,PASS1_BITS		; mm5=data0
+	psraw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm6,DESCALE_P2
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm2,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm3,DESCALE_P2
+	psrad	mm5,DESCALE_P2
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_MMX_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jfss2fst.asm
+++ b/jfss2fst.asm
@@ -0,0 +1,411 @@
+;
+; jfss2fst.asm - fast integer FDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast_sse2)
+
+EXTN(jpeg_fdct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1
+	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
--- a/jfss2int.asm
+++ b/jfss2int.asm
@@ -0,0 +1,641 @@
+;
+; jfss2int.asm - accurate integer FDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_islow_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6
+
+	align	16
+	global	EXTN(jpeg_fdct_islow_sse2)
+
+EXTN(jpeg_fdct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jfsseflt.asm
+++ b/jfsseflt.asm
@@ -0,0 +1,383 @@
+;
+; jfsseflt.asm - floating-point FDCT (SSE)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+%define JFDCT_FLT_SSE_SUPPORTED
+%endif
+%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+%define JFDCT_FLT_SSE_SUPPORTED
+%endif
+%ifdef JFDCT_FLT_SSE_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382	times 4 dd  0.382683432365089771728460
+PD_0_707	times 4 dd  0.707106781186547524400844
+PD_0_541	times 4 dd  0.541196100146196984399723
+PD_1_306	times 4 dd  1.306562964876376527856643
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float_sse (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_float_sse)
+
+EXTN(jpeg_fdct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, byte 4*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/ji3dnflt.asm
+++ b/ji3dnflt.asm
@@ -0,0 +1,462 @@
+;
+; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414	times 2 dd  1.414213562373095048801689
+PD_1_847	times 2 dd  1.847759065022573512256366
+PD_1_082	times 2 dd  1.082392200292393968799446
+PD_2_613	times 2 dd  2.613125929752753055713286
+PD_RNDINT_MAGIC	times 2 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_3dnow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                        JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_float_3dnow)
+
+EXTN(jpeg_idct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	pushpic	ebx		; save GOT address
+	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	or	eax,ebx
+	poppic	ebx		; restore GOT address
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0
+	psrad     mm0,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0
+	punpckhdq mm1,mm1
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0
+	punpcklwd mm1,mm1
+	psrad     mm0,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0
+	pi2fd     mm1,mm1
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm5,mm5
+	punpcklwd mm1,mm1
+	psrad     mm5,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm5,mm5
+	pi2fd     mm1,mm1
+
+	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 01)
+	pfadd	mm7,mm3			; mm7=data1=(10 11)
+	pfsub	mm5,mm1			; mm5=data7=(70 71)
+	pfsub	mm0,mm3			; mm0=data6=(60 61)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq      mm1,mm6		; transpose coefficients
+	punpckldq mm6,mm7		; mm6=(00 10)
+	punpckhdq mm1,mm7		; mm1=(01 11)
+	movq      mm3,mm0		; transpose coefficients
+	punpckldq mm0,mm5		; mm0=(60 70)
+	punpckhdq mm3,mm5		; mm3=(61 71)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm6,mm7
+	movq	mm1,mm5
+	pfadd	mm7,mm2			; mm7=data2=(20 21)
+	pfadd	mm5,mm4			; mm5=data4=(40 41)
+	pfsub	mm6,mm2			; mm6=data5=(50 51)
+	pfsub	mm1,mm4			; mm1=data3=(30 31)
+
+	movq      mm0,mm7		; transpose coefficients
+	punpckldq mm7,mm1		; mm7=(20 30)
+	punpckhdq mm0,mm1		; mm0=(21 31)
+	movq      mm3,mm5		; transpose coefficients
+	punpckldq mm5,mm6		; mm5=(40 50)
+	punpckhdq mm3,mm6		; mm3=(41 51)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 10)
+	pfadd	mm7,mm3			; mm7=data1=(01 11)
+	pfsub	mm5,mm1			; mm5=data7=(07 17)
+	pfsub	mm0,mm3			; mm0=data6=(06 16)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm3,mm3
+	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
+	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
+	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
+	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
+
+	pand	mm6,mm3			; mm6=(00 -- 10 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
+	pand	mm0,mm3			; mm0=(06 -- 16 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
+	por	mm6,mm7			; mm6=(00 01 10 11)
+	por	mm0,mm5			; mm0=(06 07 16 17)
+
+	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
+	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm7,mm1
+	movq	mm5,mm3
+	pfadd	mm1,mm2			; mm1=data2=(02 12)
+	pfadd	mm3,mm4			; mm3=data4=(04 14)
+	pfsub	mm7,mm2			; mm7=data5=(05 15)
+	pfsub	mm5,mm4			; mm5=data3=(03 13)
+
+	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm4,mm4
+	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
+	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
+	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
+	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
+
+	pand	mm3,mm4			; mm3=(04 -- 14 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
+	pand	mm1,mm4			; mm1=(02 -- 12 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
+	por	mm3,mm7			; mm3=(04 05 14 15)
+	por	mm1,mm5			; mm1=(02 03 12 13)
+
+	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
+
+	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
+	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
+	paddb     mm6,mm2
+	paddb     mm1,mm2
+
+	movq      mm4,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 2*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jidctflt.asm
+++ b/jidctflt.asm
@@ -0,0 +1,473 @@
+;
+; jidctflt.asm - floating-point IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+%define ROTATOR_TYPE	FP32	; float
+
+	alignz	16
+	global	EXTN(jconst_idct_float)
+
+EXTN(jconst_idct_float):
+
+F_1_414	dd	1.414213562373095048801689	; 2*cos(PI*1/4)
+F_1_847	dd	1.847759065022573512256366	; 2*cos(PI*1/8)
+F_1_082	dd	1.082392200292393968799446	; 2*(cos(PI*1/8)-cos(PI*3/8))
+F_2_613	dd	2.613125929752753055713286	; 2*(cos(PI*1/8)+cos(PI*3/8))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define tmp		ebp-SIZEOF_FP64	; double tmp
+%define workspace	tmp-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+%define rndint_magic	workspace-SIZEOF_FP32
+					; float rndint_magic = 100663296.0F
+%define gotptr		rndint_magic-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_idct_float)
+
+EXTN(jpeg_idct_float):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]
+	push	FP32 0x4CC00000		; (float)(0x00C00000 << 3)
+	pushpic	eax			; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	fst	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	; -- Even part
+
+	fild	JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+
+	fxch	st0,st3
+
+	fmul	FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st2
+	fmul	FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+	fmul	FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st3
+	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+
+	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st3,st0
+
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+
+	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
+	fsubr	st0,st2
+	fxch	st0,st4
+	faddp	st2,st0
+
+	fsub	st0,st2
+
+	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
+	fsub	st0,st3
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	; -- Odd part
+
+	fild	JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+
+	fxch	st0,st3
+
+	fmul	FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st2
+	fmul	FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+	fmul	FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st6
+	fxch	st3,st0
+	fmul	FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st5
+	fstp	FP64 [tmp]
+
+	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st2,st0
+	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
+	fsubr	st0,st5
+	fxch	st0,st6
+	faddp	st5,st0
+
+	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
+	fsub	st0,st5
+	fxch	st0,st5
+	faddp	st2,st0
+
+	fld	st5
+	fadd	st0,st1
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
+	fxch	st0,st6
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
+	fxch	st0,st6
+	fsubr	st1,st0
+	fsubp	st6,st0
+
+	; -- Final output stage
+
+	fsub	st0,st1
+	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st3
+	faddp	st2,st0
+	fsub	st4,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fxch	st0,st2
+
+	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
+
+	fadd	st1,st0
+	fld	FP64 [tmp]
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+	fld	st0	; st0 = st0 + st2, st2 = st0 - st2
+	fsub	st0,st3
+	fxch	st0,st3
+	faddp	st1,st0
+
+	fxch	st0,st3
+
+	fstp	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_FLOAT_MULT_TYPE
+	add	edi, byte SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, POINTER [jdstruct_sample_range_limit(edx)]
+	sub	edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST_FLOAT
+	mov	eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
+	add	eax,eax			; shl eax,1 (shift out the sign bit)
+	jnz	short .rowDCT
+
+	mov	eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
+	mov	ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
+	or	eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
+	or	ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
+	or	eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
+	or	ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
+	or	eax,ebx
+	add	eax,eax			; shl eax,1 (shift out the sign bit)
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	push	eax
+
+	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
+	fadd	FP32 [rndint_magic]
+	fstp	FP32 [esp]
+
+	pop	eax
+	and	eax,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	; -- Even part
+
+	fld	FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
+
+	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st3,st0
+
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+
+	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
+	fsubr	st0,st2
+	fxch	st0,st4
+	faddp	st2,st0
+
+	fsub	st0,st2
+
+	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
+	fsub	st0,st3
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	; -- Odd part
+
+	fld	FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st3
+	fld	FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st5
+	fstp	FP64 [tmp]
+
+	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st2,st0
+	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
+	fsubr	st0,st5
+	fxch	st0,st6
+	faddp	st5,st0
+
+	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
+	fsub	st0,st5
+	fxch	st0,st5
+	faddp	st2,st0
+
+	fld	st5
+	fadd	st0,st1
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
+	fxch	st0,st6
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
+	fxch	st0,st6
+	fsubr	st1,st0
+	fsubp	st6,st0
+
+	; -- Final output stage
+
+	sub	esp, byte DCTSIZE*SIZEOF_FP32
+
+	fsub	st0,st1
+	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st3
+	faddp	st2,st0
+	fsub	st4,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fld	FP32 [rndint_magic]
+
+	fadd	st4,st0
+	fadd	st1,st0
+	fadd	st2,st0
+	fadd	st3,st0
+
+	fxch	st0,st4
+
+	fstp	FP32 [esp+6*SIZEOF_FP32]
+	fstp	FP32 [esp+1*SIZEOF_FP32]
+	fstp	FP32 [esp+0*SIZEOF_FP32]
+	fstp	FP32 [esp+7*SIZEOF_FP32]
+
+	fxch	st0,st1
+
+	fadd	st2,st0
+	fld	FP64 [tmp]
+	fld	st1	; st4 = st4 + st1, st1 = st4 - st1
+	fsubr	st0,st5
+	fxch	st0,st2
+	faddp	st5,st0
+	fld	st0	; st0 = st0 + st3, st3 = st0 - st3
+	fsub	st0,st4
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fxch	st0,st2
+
+	fadd	st1,st0
+	fadd	st2,st0
+	fadd	st3,st0
+	faddp	st4,st0
+
+	fstp	FP32 [esp+5*SIZEOF_FP32]
+	fstp	FP32 [esp+4*SIZEOF_FP32]
+	fstp	FP32 [esp+3*SIZEOF_FP32]
+	fstp	FP32 [esp+2*SIZEOF_FP32]
+
+%assign i 0	; i=0;
+%rep 4	; -- repeat 4 times ---
+	pop	eax
+	pop	ebx
+	and	eax,RANGE_MASK
+	and	ebx,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl
+%assign i i+2	; i+=2;
+%endrep	; -- repeat end ---
+
+.nextrow:
+	pop	edi
+	add	esi, byte DCTSIZE*SIZEOF_FAST_FLOAT
+	add	edi, byte SIZEOF_JSAMPROW	; advance pointer to next row
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jidctfst.asm
+++ b/jidctfst.asm
@@ -0,0 +1,464 @@
+;
+; jidctfst.asm - fast integer IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; We can gain a little more speed, with a further compromise in accuracy,
+; by omitting the addition in a descaling shift.  This yields an
+; incorrectly rounded result half the time...
+;
+%macro	descale 2
+%ifdef USE_ACCURATE_ROUNDING
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32
+%endif
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER		; JSAMPLE * range_limit
+%define ptr		range_limit-SIZEOF_POINTER	; void * ptr
+%define workspace	ptr-DCTSIZE2*SIZEOF_INT
+					; int workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_ifast)
+
+EXTN(jpeg_idct_ifast):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	cwde
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(5,edi,SIZEOF_INT)], eax
+	mov	INT [COL(6,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+
+	mov	POINTER [ptr], edi	; wsptr
+
+	; -- Even part
+
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	cx, IFAST_MULT_TYPE [COL(4,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	imul	bx, IFAST_MULT_TYPE [COL(2,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	di, IFAST_MULT_TYPE [COL(6,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	lea	edx,[eax+ecx]		; edx=tmp10
+	sub	eax,ecx			; eax=tmp11
+
+	lea	ecx,[ebx+edi]		; ecx=tmp13
+	sub	ebx,edi
+	imul	ebx,(F_1_414)
+	descale	ebx,CONST_BITS
+	sub	ebx,ecx			; ebx=tmp12
+
+	lea	edi,[edx+ecx]		; edi=tmp0
+	sub	edx,ecx			; edx=tmp3
+	lea	ecx,[eax+ebx]		; ecx=tmp1
+	sub	eax,ebx			; eax=tmp2
+
+	push	edx		; tmp3
+	push	eax		; tmp2
+	push	ecx		; tmp1
+	push	edi		; tmp0
+
+	; -- Odd part
+
+	mov	edx, POINTER [esp+16]	; quantptr
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(1,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	bx, IFAST_MULT_TYPE [COL(7,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movsx	edi, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	di, IFAST_MULT_TYPE [COL(5,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	cx, IFAST_MULT_TYPE [COL(3,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	lea	esi,[eax+ebx]		; esi=z11
+	sub	eax,ebx			; eax=z12
+	lea	edx,[edi+ecx]		; edx=z13
+	sub	edi,ecx			; edi=z10
+
+	lea	ebx,[esi+edx]		; ebx=tmp7
+	sub	esi,edx
+	imul	esi,(F_1_414)		; esi=tmp11
+	descale	esi,CONST_BITS
+
+	lea	ecx,[edi+eax]
+	imul	ecx,(F_1_847)		; ecx=z5
+	imul	edi,(-F_2_613)		; edi=MULTIPLY(z10,-FIX_2_613125930)
+	imul	eax,(F_1_082)		; eax=MULTIPLY(z12,FIX_1_082392200)
+	descale	ecx,CONST_BITS
+	descale	edi,CONST_BITS
+	descale	eax,CONST_BITS
+	add	edi,ecx			; edi=tmp12
+	sub	eax,ecx			; eax=tmp10
+
+	; -- Final output stage
+
+	sub	edi,ebx		; edi=tmp6
+	pop	edx		; edx=tmp0
+	sub	esi,edi		; esi=tmp5
+	pop	ecx		; ecx=tmp1
+	add	eax,esi		; eax=tmp4
+	push	esi		; tmp5
+	push	eax		; tmp4
+
+	lea	eax,[edx+ebx]	; eax=data0(=tmp0+tmp7)
+	sub	edx,ebx		; edx=data7(=tmp0-tmp7)
+	lea	ebx,[ecx+edi]	; ebx=data1(=tmp1+tmp6)
+	sub	ecx,edi		; ecx=data6(=tmp1-tmp6)
+
+	mov	edi, POINTER [ptr]	; edi=wsptr
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], edx
+	mov	INT [COL(1,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(6,edi,SIZEOF_INT)], ecx
+
+	pop	esi		; esi=tmp4
+	pop	eax		; eax=tmp5
+	pop	edx		; edx=tmp2
+	pop	ecx		; ecx=tmp3
+
+	lea	ebx,[edx+eax]	; ebx=data2(=tmp2+tmp5)
+	sub	edx,eax		; edx=data5(=tmp2-tmp5)
+	lea	eax,[ecx+esi]	; eax=data4(=tmp3+tmp4)
+	sub	ecx,esi		; ecx=data3(=tmp3-tmp4)
+
+	mov	INT [COL(2,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(5,edi,SIZEOF_INT)], edx
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], ecx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_IFAST_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	eax, INT [ROW(4,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+
+	mov	POINTER [ptr], edi	; outptr
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(4,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(6,esi,SIZEOF_INT)]
+
+	lea	edx,[eax+ecx]		; edx=tmp10
+	sub	eax,ecx			; eax=tmp11
+
+	lea	ecx,[ebx+edi]		; ecx=tmp13
+	sub	ebx,edi
+	imul	ebx,(F_1_414)
+	descale	ebx,CONST_BITS
+	sub	ebx,ecx			; ebx=tmp12
+
+	lea	edi,[edx+ecx]		; edi=tmp0
+	sub	edx,ecx			; edx=tmp3
+	lea	ecx,[eax+ebx]		; ecx=tmp1
+	sub	eax,ebx			; eax=tmp2
+
+	push	edx		; tmp3
+	push	eax		; tmp2
+	push	ecx		; tmp1
+	push	edi		; tmp0
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	lea	esi,[eax+ebx]		; esi=z11
+	sub	eax,ebx			; eax=z12
+	lea	edx,[edi+ecx]		; edx=z13
+	sub	edi,ecx			; edi=z10
+
+	lea	ebx,[esi+edx]		; ebx=tmp7
+	sub	esi,edx
+	imul	esi,(F_1_414)		; esi=tmp11
+	descale	esi,CONST_BITS
+
+	lea	ecx,[edi+eax]
+	imul	ecx,(F_1_847)		; ecx=z5
+	imul	edi,(-F_2_613)		; edi=MULTIPLY(z10,-FIX_2_613125930)
+	imul	eax,(F_1_082)		; eax=MULTIPLY(z12,FIX_1_082392200)
+	descale	ecx,CONST_BITS
+	descale	edi,CONST_BITS
+	descale	eax,CONST_BITS
+	add	edi,ecx			; edi=tmp12
+	sub	eax,ecx			; eax=tmp10
+
+	; -- Final output stage
+
+	sub	edi,ebx		; edi=tmp6
+	pop	edx		; edx=tmp0
+	sub	esi,edi		; esi=tmp5
+	pop	ecx		; ecx=tmp1
+	add	eax,esi		; eax=tmp4
+	push	esi		; tmp5
+	push	eax		; tmp4
+
+	lea	eax,[edx+ebx]	; eax=data0(=tmp0+tmp7)
+	sub	edx,ebx		; edx=data7(=tmp0-tmp7)
+	lea	ebx,[ecx+edi]	; ebx=data1(=tmp1+tmp6)
+	sub	ecx,edi		; ecx=data6(=tmp1-tmp6)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	descale	edx,(PASS1_BITS+3)
+	descale	ebx,(PASS1_BITS+3)
+	descale	ecx,(PASS1_BITS+3)
+
+	mov	edi, POINTER [ptr]		; edi=outptr
+
+	and	eax,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	ebx,RANGE_MASK
+	and	ecx,RANGE_MASK
+
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], cl
+
+	pop	esi		; esi=tmp4
+	pop	eax		; eax=tmp5
+	pop	edx		; edx=tmp2
+	pop	ecx		; ecx=tmp3
+
+	lea	ebx,[edx+eax]	; ebx=data2(=tmp2+tmp5)
+	sub	edx,eax		; edx=data5(=tmp2-tmp5)
+	lea	eax,[ecx+esi]	; eax=data4(=tmp3+tmp4)
+	sub	ecx,esi		; ecx=data3(=tmp3-tmp4)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	ebx,(PASS1_BITS+3)
+	descale	edx,(PASS1_BITS+3)
+	descale	eax,(PASS1_BITS+3)
+	descale	ecx,(PASS1_BITS+3)
+
+	and	ebx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	eax,RANGE_MASK
+	and	ecx,RANGE_MASK
+
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], cl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+%endif ; DCT_IFAST_SUPPORTED
--- a/jidctint.asm
+++ b/jidctint.asm
@@ -0,0 +1,524 @@
+;
+; jidctint.asm - accurate integer IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; Descale and correctly round a DWORD value that's scaled by N bits.
+;
+%macro	descale 2
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER		; JSAMPLE * range_limit
+%define ptr		range_limit-SIZEOF_POINTER	; void * ptr
+%define workspace	ptr-DCTSIZE2*SIZEOF_INT
+					; int workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_islow)
+
+EXTN(jpeg_idct_islow):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde
+
+	sal	eax,PASS1_BITS
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(5,edi,SIZEOF_INT)], eax
+	mov	INT [COL(6,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+
+	mov	POINTER [ptr], edi	; wsptr
+
+	; -- Even part
+
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(4,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(2,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(6,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	lea	edx,[eax+ecx]
+	sub	eax,ecx
+	sal	edx,CONST_BITS	; edx=tmp0
+	sal	eax,CONST_BITS	; eax=tmp1
+
+	lea	ecx,[ebx+edi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	ebx,(F_0_765)	; ebx=MULTIPLY(z2,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(z3,-FIX_1_847759065)
+	add	ebx,ecx		; ebx=tmp3
+	add	edi,ecx		; edi=tmp2
+
+	lea	ecx,[edx+ebx]	; ecx=tmp10
+	sub	edx,ebx		; edx=tmp13
+	lea	ebx,[eax+edi]	; ebx=tmp11
+	sub	eax,edi		; eax=tmp12
+
+	push	edx		; tmp13
+	push	eax		; tmp12
+	push	ebx		; tmp11
+	push	ecx		; tmp10
+
+	; -- Odd part
+
+	mov	edx, POINTER [esp+16]	; quantptr
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	bx, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	push	eax		; eax=tmp3
+	push	edi		; edi=tmp2
+	push	ecx		; ecx=tmp1
+	push	ebx		; ebx=tmp0
+
+	lea	esi,[ebx+edi]	; esi=z3
+	lea	edx,[ecx+eax]	; edx=z4
+	add	ebx,eax		; ebx=z1
+	add	ecx,edi		; ecx=z2
+
+	lea	eax,[esi+edx]
+	imul	eax,(F_1_175)	; eax=z5
+
+	imul	esi,(-F_1_961)	; esi=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edx,(-F_0_390)	; edx=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	ebx,(-F_0_899)	; ebx=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ecx,(-F_2_562)	; ecx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	esi,eax		; esi=z3(=z3+z5)
+	add	edx,eax		; edx=z4(=z4+z5)
+
+	lea	edi,[esi+ebx]	; edi=z1+z3
+	lea	eax,[edx+ecx]	; eax=z2+z4
+	add	esi,ecx		; esi=z2+z3
+	add	edx,ebx		; edx=z1+z4
+
+	pop	ecx		; ecx=tmp0
+	pop	ebx		; ebx=tmp1
+	imul	ecx,(F_0_298)	; ecx=tmp0(=MULTIPLY(tmp0,FIX_0_298631336))
+	imul	ebx,(F_2_053)	; ebx=tmp1(=MULTIPLY(tmp1,FIX_2_053119869))
+	add	edi,ecx		; edi=tmp0(=tmp0+z1+z3)
+	add	eax,ebx		; eax=tmp1(=tmp1+z2+z4)
+
+	pop	ecx		; ecx=tmp2
+	pop	ebx		; ebx=tmp3
+	imul	ecx,(F_3_072)	; ecx=tmp2(=MULTIPLY(tmp2,FIX_3_072711026))
+	imul	ebx,(F_1_501)	; ebx=tmp3(=MULTIPLY(tmp3,FIX_1_501321110))
+	add	esi,ecx		; esi=tmp2(=tmp2+z2+z3)
+	add	edx,ebx		; edx=tmp3(=tmp3+z1+z4)
+
+	; -- Final output stage
+
+	pop	ecx		; ecx=tmp10
+	pop	ebx		; ebx=tmp11
+	push	eax		; tmp1
+	push	edi		; tmp0
+
+	lea	eax,[ecx+edx]	; eax=data0(=tmp10+tmp3)
+	sub	ecx,edx		; ecx=data7(=tmp10-tmp3)
+	lea	edx,[ebx+esi]	; edx=data1(=tmp11+tmp2)
+	sub	ebx,esi		; ebx=data6(=tmp11-tmp2)
+
+	mov	edi, POINTER [ptr]	; edi=wsptr
+
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	edx,(CONST_BITS-PASS1_BITS)
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], ecx
+	mov	INT [COL(1,edi,SIZEOF_INT)], edx
+	mov	INT [COL(6,edi,SIZEOF_INT)], ebx
+
+	pop	esi		; esi=tmp0
+	pop	eax		; eax=tmp1
+	pop	ecx		; ecx=tmp12
+	pop	edx		; edx=tmp13
+
+	lea	ebx,[ecx+eax]	; ebx=data2(=tmp12+tmp1)
+	sub	ecx,eax		; ecx=data5(=tmp12-tmp1)
+	lea	eax,[edx+esi]	; eax=data3(=tmp13+tmp0)
+	sub	edx,esi		; edx=data4(=tmp13-tmp0)
+
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	descale	edx,(CONST_BITS-PASS1_BITS)
+
+	mov	INT [COL(2,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(5,edi,SIZEOF_INT)], ecx
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], edx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	eax, INT [ROW(4,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+
+	mov	POINTER [ptr], edi	; outptr
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(4,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(6,esi,SIZEOF_INT)]
+
+	lea	edx,[eax+ecx]
+	sub	eax,ecx
+	sal	edx,CONST_BITS	; edx=tmp0
+	sal	eax,CONST_BITS	; eax=tmp1
+
+	lea	ecx,[ebx+edi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	ebx,(F_0_765)	; ebx=MULTIPLY(z2,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(z3,-FIX_1_847759065)
+	add	ebx,ecx		; ebx=tmp3
+	add	edi,ecx		; edi=tmp2
+
+	lea	ecx,[edx+ebx]	; ecx=tmp10
+	sub	edx,ebx		; edx=tmp13
+	lea	ebx,[eax+edi]	; ebx=tmp11
+	sub	eax,edi		; eax=tmp12
+
+	push	edx		; tmp13
+	push	eax		; tmp12
+	push	ebx		; tmp11
+	push	ecx		; tmp10
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	push	eax		; eax=tmp3
+	push	edi		; edi=tmp2
+	push	ecx		; ecx=tmp1
+	push	ebx		; ebx=tmp0
+
+	lea	esi,[ebx+edi]	; esi=z3
+	lea	edx,[ecx+eax]	; edx=z4
+	add	ebx,eax		; ebx=z1
+	add	ecx,edi		; ecx=z2
+
+	lea	eax,[esi+edx]
+	imul	eax,(F_1_175)	; eax=z5
+
+	imul	esi,(-F_1_961)	; esi=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edx,(-F_0_390)	; edx=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	ebx,(-F_0_899)	; ebx=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ecx,(-F_2_562)	; ecx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	esi,eax		; esi=z3(=z3+z5)
+	add	edx,eax		; edx=z4(=z4+z5)
+
+	lea	edi,[esi+ebx]	; edi=z1+z3
+	lea	eax,[edx+ecx]	; eax=z2+z4
+	add	esi,ecx		; esi=z2+z3
+	add	edx,ebx		; edx=z1+z4
+
+	pop	ecx		; ecx=tmp0
+	pop	ebx		; ebx=tmp1
+	imul	ecx,(F_0_298)	; ecx=tmp0(=MULTIPLY(tmp0,FIX_0_298631336))
+	imul	ebx,(F_2_053)	; ebx=tmp1(=MULTIPLY(tmp1,FIX_2_053119869))
+	add	edi,ecx		; edi=tmp0(=tmp0+z1+z3)
+	add	eax,ebx		; eax=tmp1(=tmp1+z2+z4)
+
+	pop	ecx		; ecx=tmp2
+	pop	ebx		; ebx=tmp3
+	imul	ecx,(F_3_072)	; ecx=tmp2(=MULTIPLY(tmp2,FIX_3_072711026))
+	imul	ebx,(F_1_501)	; ebx=tmp3(=MULTIPLY(tmp3,FIX_1_501321110))
+	add	esi,ecx		; esi=tmp2(=tmp2+z2+z3)
+	add	edx,ebx		; edx=tmp3(=tmp3+z1+z4)
+
+	; -- Final output stage
+
+	pop	ecx		; ecx=tmp10
+	pop	ebx		; ebx=tmp11
+	push	eax		; tmp1
+	push	edi		; tmp0
+
+	lea	eax,[ecx+edx]	; eax=data0(=tmp10+tmp3)
+	sub	ecx,edx		; ecx=data7(=tmp10-tmp3)
+	lea	edx,[ebx+esi]	; edx=data1(=tmp11+tmp2)
+	sub	ebx,esi		; ebx=data6(=tmp11-tmp2)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(CONST_BITS+PASS1_BITS+3)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3)
+	descale	edx,(CONST_BITS+PASS1_BITS+3)
+	descale	ebx,(CONST_BITS+PASS1_BITS+3)
+
+	mov	edi, POINTER [ptr]		; edi=outptr
+
+	and	eax,RANGE_MASK
+	and	ecx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	ebx,RANGE_MASK
+
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], cl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], bl
+
+	pop	esi		; esi=tmp0
+	pop	eax		; eax=tmp1
+	pop	ecx		; ecx=tmp12
+	pop	edx		; edx=tmp13
+
+	lea	ebx,[ecx+eax]	; ebx=data2(=tmp12+tmp1)
+	sub	ecx,eax		; ecx=data5(=tmp12-tmp1)
+	lea	eax,[edx+esi]	; eax=data3(=tmp13+tmp0)
+	sub	edx,esi		; edx=data4(=tmp13-tmp0)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	ebx,(CONST_BITS+PASS1_BITS+3)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3)
+	descale	eax,(CONST_BITS+PASS1_BITS+3)
+	descale	edx,(CONST_BITS+PASS1_BITS+3)
+
+	and	ebx,RANGE_MASK
+	and	ecx,RANGE_MASK
+	and	eax,RANGE_MASK
+	and	edx,RANGE_MASK
+
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], cl
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], dl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jidctred.asm
+++ b/jidctred.asm
@@ -0,0 +1,688 @@
+;
+; jidctred.asm - reduced-size IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size output:
+; either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; Descale and correctly round a DWORD value that's scaled by N bits.
+;
+%macro	descale 2
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER	; JSAMPLE * range_limit
+%define workspace	range_limit-(DCTSIZE*4)*SIZEOF_INT
+					; int workspace[DCTSIZE*4]
+
+	align	16
+	global	EXTN(jpeg_idct_4x4)
+
+EXTN(jpeg_idct_4x4):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	; Don't bother to process column 4, because second pass won't use it
+	cmp	ecx, byte DCTSIZE-4
+	je	near .nextcolumn
+
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	mov	ax, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT
+
+	; -- AC terms all zero; we need not examine term 4 for 4x4 output
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde
+
+	sal	eax, PASS1_BITS
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+	push	edi	; wsptr
+
+	; -- Even part
+
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(2,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(6,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	imul	ebx,(F_1_847)		; ebx=MULTIPLY(z2,FIX_1_847759065)
+	imul	ecx,(-F_0_765)		; ecx=MULTIPLY(z3,-FIX_0_765366865)
+	sal	eax,(CONST_BITS+1)	; eax=tmp0
+	add	ecx,ebx			; ecx=tmp2
+
+	lea	edi,[eax+ecx]		; edi=tmp10
+	sub	eax,ecx			; eax=tmp12
+
+	push	eax		; tmp12
+	push	edi		; tmp10
+
+	; -- Odd part
+
+	movsx	edi, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	imul	di, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	mov	esi,edi		; esi=edi=z1
+	mov	edx,ecx		; edx=ecx=z2
+	imul	edi,(-F_0_211)	; edi=MULTIPLY(z1,-FIX_0_211164243)
+	imul	ecx,(F_1_451)	; ecx=MULTIPLY(z2,FIX_1_451774981)
+	imul	esi,(-F_0_509)	; esi=MULTIPLY(z1,-FIX_0_509795579)
+	imul	edx,(-F_0_601)	; edx=MULTIPLY(z2,-FIX_0_601344887)
+
+	add	edi,ecx		; edi=(tmp0)
+	add	esi,edx		; esi=(tmp2)
+
+	mov	ecx,ebx		; ecx=ebx=z3
+	mov	edx,eax		; edx=eax=z4
+	imul	ebx,(-F_2_172)	; ebx=MULTIPLY(z3,-FIX_2_172734803)
+	imul	eax,(F_1_061)	; eax=MULTIPLY(z4,FIX_1_061594337)
+	imul	ecx,(F_0_899)	; ecx=MULTIPLY(z3,FIX_0_899976223)
+	imul	edx,(F_2_562)	; edx=MULTIPLY(z4,FIX_2_562915447)
+
+	add	edi,ebx
+	add	esi,ecx
+	add	edi,eax		; edi=tmp0
+	add	esi,edx		; esi=tmp2
+
+	; -- Final output stage
+
+	pop	ebx		; ebx=tmp10
+	pop	ecx		; ecx=tmp12
+
+	lea	eax,[ebx+esi]	; eax=data0(=tmp10+tmp2)
+	sub	ebx,esi		; ebx=data3(=tmp10-tmp2)
+	lea	edx,[ecx+edi]	; edx=data1(=tmp12+tmp0)
+	sub	ecx,edi		; ecx=data2(=tmp12-tmp0)
+
+	pop	edi	; wsptr
+
+	descale	eax,(CONST_BITS-PASS1_BITS+1)
+	descale	ebx,(CONST_BITS-PASS1_BITS+1)
+	descale	edx,(CONST_BITS-PASS1_BITS+1)
+	descale	ecx,(CONST_BITS-PASS1_BITS+1)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(1,edi,SIZEOF_INT)], edx
+	mov	INT [COL(2,edi,SIZEOF_INT)], ecx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process 4 rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	eax, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+	push	edi	; outptr
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(6,esi,SIZEOF_INT)]
+
+	imul	ebx,(F_1_847)		; ebx=MULTIPLY(z2,FIX_1_847759065)
+	imul	ecx,(-F_0_765)		; ecx=MULTIPLY(z3,-FIX_0_765366865)
+	sal	eax,(CONST_BITS+1)	; eax=tmp0
+	add	ecx,ebx			; ecx=tmp2
+
+	lea	edi,[eax+ecx]		; edi=tmp10
+	sub	eax,ecx			; eax=tmp12
+
+	push	eax		; tmp12
+	push	edi		; tmp10
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(7,esi,SIZEOF_INT)]
+
+	mov	esi,edi		; esi=edi=z1
+	mov	edx,ecx		; edx=ecx=z2
+	imul	edi,(-F_0_211)	; edi=MULTIPLY(z1,-FIX_0_211164243)
+	imul	ecx,(F_1_451)	; ecx=MULTIPLY(z2,FIX_1_451774981)
+	imul	esi,(-F_0_509)	; esi=MULTIPLY(z1,-FIX_0_509795579)
+	imul	edx,(-F_0_601)	; edx=MULTIPLY(z2,-FIX_0_601344887)
+
+	add	edi,ecx		; edi=(tmp0)
+	add	esi,edx		; esi=(tmp2)
+
+	mov	ecx,ebx		; ecx=ebx=z3
+	mov	edx,eax		; edx=eax=z4
+	imul	ebx,(-F_2_172)	; ebx=MULTIPLY(z3,-FIX_2_172734803)
+	imul	eax,(F_1_061)	; eax=MULTIPLY(z4,FIX_1_061594337)
+	imul	ecx,(F_0_899)	; ecx=MULTIPLY(z3,FIX_0_899976223)
+	imul	edx,(F_2_562)	; edx=MULTIPLY(z4,FIX_2_562915447)
+
+	add	edi,ebx
+	add	esi,ecx
+	add	edi,eax		; edi=tmp0
+	add	esi,edx		; esi=tmp2
+
+	; -- Final output stage
+
+	pop	ebx		; ebx=tmp10
+	pop	ecx		; ecx=tmp12
+
+	lea	eax,[ebx+esi]	; eax=data0(=tmp10+tmp2)
+	sub	ebx,esi		; ebx=data3(=tmp10-tmp2)
+	lea	edx,[ecx+edi]	; edx=data1(=tmp12+tmp0)
+	sub	ecx,edi		; ecx=data2(=tmp12-tmp0)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(CONST_BITS+PASS1_BITS+3+1)
+	descale	ebx,(CONST_BITS+PASS1_BITS+3+1)
+	descale	edx,(CONST_BITS+PASS1_BITS+3+1)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3+1)
+
+	pop	edi	; outptr
+
+	and	eax,RANGE_MASK
+	and	ebx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	ecx,RANGE_MASK
+
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], cl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER	; JSAMPLE * range_limit
+%define workspace	range_limit-(DCTSIZE*2)*SIZEOF_INT
+					; int workspace[DCTSIZE*2]
+
+	align	16
+	global	EXTN(jpeg_idct_2x2)
+
+EXTN(jpeg_idct_2x2):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	; Don't bother to process columns 2,4,6
+	test	ecx, 0x09
+	jz	near .nextcolumn
+
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	mov	ax, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	; -- AC terms all zero; we need not examine terms 2,4,6 for 2x2 output
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde
+
+	sal	eax, PASS1_BITS
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	jmp	short .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	edi	; wsptr
+
+	; -- Odd part
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	bx, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	imul	eax,(F_3_624)	; eax=MULTIPLY(data1,FIX_3_624509785)
+	imul	ebx,(-F_1_272)	; ebx=MULTIPLY(data3,-FIX_1_272758580)
+	imul	ecx,(F_0_850)	; ecx=MULTIPLY(data5,FIX_0_850430095)
+	imul	edi,(-F_0_720)	; edi=MULTIPLY(data7,-FIX_0_720959822)
+
+	add	eax,ebx
+	add	ecx,edi
+	add	ecx,eax		; ecx=tmp0
+
+	; -- Even part
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde
+
+	sal	eax,(CONST_BITS+2)	; eax=tmp10
+
+	; -- Final output stage
+
+	pop	edi	; wsptr
+
+	lea	ebx,[eax+ecx]	; ebx=data0(=tmp10+tmp0)
+	sub	eax,ecx		; eax=data1(=tmp10-tmp0)
+
+	pop	ecx	; ctr
+
+	descale	ebx,(CONST_BITS-PASS1_BITS+2)
+	descale	eax,(CONST_BITS-PASS1_BITS+2)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process 2 rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(3,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	eax, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(7,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	jmp	short .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	ecx	; ctr
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	edx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	imul	eax,(F_3_624)	; eax=MULTIPLY(data1,FIX_3_624509785)
+	imul	ebx,(-F_1_272)	; ebx=MULTIPLY(data3,-FIX_1_272758580)
+	imul	ecx,(F_0_850)	; ecx=MULTIPLY(data5,FIX_0_850430095)
+	imul	edx,(-F_0_720)	; edx=MULTIPLY(data7,-FIX_0_720959822)
+
+	add	eax,ebx
+	add	ecx,edx
+	add	ecx,eax		; ecx=tmp0
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	sal	eax,(CONST_BITS+2)	; eax=tmp10
+
+	; -- Final output stage
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	lea	ebx,[eax+ecx]	; ebx=data0(=tmp10+tmp0)
+	sub	eax,ecx		; eax=data1(=tmp10-tmp0)
+
+	pop	ecx	; ctr
+
+	descale	ebx,(CONST_BITS+PASS1_BITS+3+2)
+	descale	eax,(CONST_BITS+PASS1_BITS+3+2)
+
+	and	ebx,RANGE_MASK
+	and	eax,RANGE_MASK
+	mov	bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+
+.nextrow:
+	pop	edi
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 1x1 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define ebp		esp-4		; use esp instead of ebp
+
+	align	16
+	global	EXTN(jpeg_idct_1x1)
+
+EXTN(jpeg_idct_1x1):
+;	push	ebp
+;	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	; We hardly need an inverse DCT routine for this: just take the
+	; average pixel value, which is one-eighth of the DC coefficient.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	ecx, JCOEFPTR [coef_block(ebp)]		; inptr
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+
+	mov	ax, JCOEF [COL(0,ecx,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	mov	ecx, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	edx, JDIMENSION [output_col(ebp)]
+	mov	ecx, JSAMPROW [ecx]			; (JSAMPLE *)
+
+	add	ax, (1 << (3-1)) + (CENTERJSAMPLE << 3)
+	sar	ax,3		; descale
+
+	test	ah,ah		; unsigned saturation
+	jz	short .output
+	not	ax
+	sar	ax,15
+	alignx	16,3
+.output:
+	mov	JSAMPLE [ecx+edx*SIZEOF_JSAMPLE], al
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+;	pop	ebp
+	ret
+
+%endif ; IDCT_SCALING_SUPPORTED
--- a/jimmxfst.asm
+++ b/jimmxfst.asm
@@ -0,0 +1,510 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_ifast_mmx)
+
+EXTN(jpeg_idct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
+	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
+	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
+	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
+	psubw	mm4,mm0			; mm4=tmp5
+
+	movq      mm3,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
+	movq      mm0,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
+	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
+	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm7
+	movq	mm0,mm1
+	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
+	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
+	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
+	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
+
+	movq      mm4,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
+	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
+	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
+
+	movq      mm0,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(00 10 20 30)
+	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
+	movq      mm5,mm3		; transpose coefficients(phase 2)
+	punpckldq mm3,mm4		; mm3=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
+	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm7		; mm1=(40 50 60 70)
+	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
+	movq      mm0,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(42 52 62 72)
+	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
+	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
+	psraw	mm6,(PASS1_BITS+3)	; descale
+	psraw	mm7,(PASS1_BITS+3)	; descale
+	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
+	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
+	psraw	mm1,(PASS1_BITS+3)	; descale
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psubw	mm4,mm0			; mm4=tmp5
+
+	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
+	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
+
+	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm3
+	movq	mm1,mm0
+	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
+	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
+	psraw	mm3,(PASS1_BITS+3)	; descale
+	psraw	mm0,(PASS1_BITS+3)	; descale
+	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
+	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psraw	mm1,(PASS1_BITS+3)	; descale
+
+	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
+
+	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
+	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
+
+	paddb     mm6,mm4
+	paddb     mm7,mm4
+	paddb     mm3,mm4
+	paddb     mm1,mm4
+
+	movq      mm2,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm0,mm3		; transpose coefficients(phase 1)
+	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
+	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm4,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
+	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
+	movq      mm1,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
--- a/jimmxint.asm
+++ b/jimmxint.asm
@@ -0,0 +1,862 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_islow_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		12
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_islow_mmx)
+
+EXTN(jpeg_idct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
+	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P1
+	psrad	mm3,DESCALE_P1
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
+	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
+
+	movq      mm6,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm1,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
+	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
+
+	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
+	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
+	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
+	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
+	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
+	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
+	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
+
+	movq	mm5,mm3
+	movq	mm6,mm0
+	paddd	mm3,mm4			; mm3=data2L
+	paddd	mm0,mm2			; mm0=data2H
+	psubd	mm5,mm4			; mm5=data5L
+	psubd	mm6,mm2			; mm6=data5H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
+
+	paddd	mm3,mm7
+	paddd	mm0,mm7
+	psrad	mm3,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm5,mm7
+	paddd	mm6,mm7
+	psrad	mm5,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
+	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
+
+	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
+	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
+	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
+	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
+
+	movq	mm0,mm1
+	movq	mm6,mm4
+	paddd	mm1,mm2			; mm1=data3L
+	paddd	mm4,mm7			; mm4=data3H
+	psubd	mm0,mm2			; mm0=data4L
+	psubd	mm6,mm7			; mm6=data4H
+
+	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
+
+	paddd	mm1,mm2
+	paddd	mm4,mm2
+	psrad	mm1,DESCALE_P1
+	psrad	mm4,DESCALE_P1
+	paddd	mm0,mm2
+	paddd	mm6,mm2
+	psrad	mm0,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
+	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
+	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
+
+	movq      mm4,mm3		; transpose coefficients(phase 1)
+	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm6,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
+	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
+
+	movq      mm1,mm7		; transpose coefficients(phase 2)
+	punpckldq mm7,mm3		; mm7=(00 10 20 30)
+	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
+	movq      mm5,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpckldq mm0,mm3		; mm0=(40 50 60 70)
+	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
+	movq      mm1,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm4		; mm6=(42 52 62 72)
+	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
+	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
+	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
+
+	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
+	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
+
+	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
+	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
+	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
+	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
+	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
+
+	movq	mm7,mm6
+	movq	mm2,mm1
+	paddd	mm6,mm3			; mm6=data2L
+	paddd	mm1,mm0			; mm1=data2H
+	psubd	mm7,mm3			; mm7=data5L
+	psubd	mm2,mm0			; mm2=data5H
+
+	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
+
+	paddd	mm6,mm5
+	paddd	mm1,mm5
+	psrad	mm6,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm7,mm5
+	paddd	mm2,mm5
+	psrad	mm7,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
+	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
+
+	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
+	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
+	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
+	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
+
+	movq	mm1,mm4
+	movq	mm2,mm3
+	paddd	mm4,mm0			; mm4=data3L
+	paddd	mm3,mm5			; mm3=data3H
+	psubd	mm1,mm0			; mm1=data4L
+	psubd	mm2,mm5			; mm2=data4H
+
+	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
+
+	paddd	mm4,mm0
+	paddd	mm3,mm0
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm1,mm0
+	paddd	mm2,mm0
+	psrad	mm1,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
+
+	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
+	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
+
+	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
+	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
+
+	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm5
+	paddb     mm3,mm5
+	paddb     mm6,mm5
+	paddb     mm4,mm5
+
+	movq      mm2,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm1,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
+	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
+	movq      mm5,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
+	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
+
+	movq      mm3,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm7		; transpose coefficients(phase 3)
+	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jimmxred.asm
+++ b/jimmxred.asm
@@ -0,0 +1,719 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076	times 2 dw  F_1_847,-F_0_765
+PW_F256_F089	times 2 dw  F_2_562, F_0_899
+PW_F106_MF217	times 2 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 2 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 2 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 2 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_4x4_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                    JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_4x4_mmx)
+
+EXTN(jpeg_idct_4x4_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm0,mm1
+	packsswb mm0,mm0
+	movd	eax,mm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P1_4
+	psrad	mm2,DESCALE_P1_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
+	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P1_4
+	psrad	mm0,DESCALE_P1_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
+	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
+
+	movq      mm6,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm7,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
+	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm2		; mm1=(00 10 20 30)
+	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
+	movq      mm3,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(02 12 22 32)
+	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P2_4
+	psrad	mm2,DESCALE_P2_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
+	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P2_4
+	psrad	mm0,DESCALE_P2_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
+	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
+	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
+	paddb     mm1,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm1		; transpose coefficients(phase 1)
+	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
+	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	psrlq	mm1,4*BYTE_BIT
+	psrlq	mm0,4*BYTE_BIT
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_2x2_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                    JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jpeg_idct_2x2_mmx)
+
+EXTN(jpeg_idct_2x2_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+	pcmpeqd   mm7,mm7
+	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+	movq      mm4,mm0		; mm4=(10 11 ** 13)
+	movq      mm5,mm2		; mm5=(50 51 ** 53)
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
+	pand	mm1,mm7			; mm1=(-- 31 -- 33)
+	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
+	pand	mm3,mm7			; mm3=(-- 71 -- 73)
+	por	mm0,mm1			; mm0=(11 31 13 33)
+	por	mm2,mm3			; mm2=(51 71 53 73)
+	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
+
+	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
+	pand	mm1,mm7			; mm1=(-- 35 -- 37)
+	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
+	pand	mm5,mm7			; mm5=(-- 75 -- 77)
+	por	mm6,mm1			; mm6=(15 35 17 37)
+	por	mm3,mm5			; mm3=(55 75 57 77)
+	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
+	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
+
+	; -- Even part
+
+	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+	movq	mm2,mm1				; mm2=(00 01 ** 03)
+	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
+	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
+
+	pand	mm2,mm7				; mm2=(-- 01 -- 03)
+	pand	mm5,mm7				; mm5=(-- 05 -- 07)
+	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
+	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
+
+	; -- Final output stage
+
+	movq      mm3,mm1
+	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
+	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
+	punpckldq mm1,mm3		; mm1=(A0 B0)
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
+
+	movq	mm4,mm2
+	movq	mm3,mm5
+	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
+	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
+	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
+	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
+
+	paddd	mm1,mm7
+	psrad	mm1,DESCALE_P1_2
+
+	paddd	mm2,mm7
+	paddd	mm5,mm7
+	psrad	mm2,DESCALE_P1_2
+	psrad	mm5,DESCALE_P1_2
+	paddd	mm4,mm7
+	paddd	mm3,mm7
+	psrad	mm4,DESCALE_P1_2
+	psrad	mm3,DESCALE_P1_2
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
+	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
+
+	; -- Even part
+
+	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
+
+	; -- Final output stage
+
+	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
+
+	movq      mm6,mm1
+	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
+	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
+
+	paddd     mm1,mm0
+	paddd     mm6,mm0
+	psrad     mm1,DESCALE_P2_2
+	psrad     mm6,DESCALE_P2_2
+
+	movq      mm7,mm1		; transpose coefficients
+	punpckldq mm1,mm6		; mm1=(C0 D0)
+	punpckhdq mm7,mm6		; mm7=(C1 D1)
+
+	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
+	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	movd	ecx,mm1
+	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
+	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; IDCT_SCALING_SUPPORTED
--- a/jiss2flt.asm
+++ b/jiss2flt.asm
@@ -0,0 +1,508 @@
+;
+; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414	times 4 dd  1.414213562373095048801689
+PD_1_847	times 4 dd  1.847759065022573512256366
+PD_1_082	times 4 dd  1.082392200292393968799446
+PD_M2_613	times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_float_sse2)
+
+EXTN(jpeg_idct_float_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	xmm1, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	xmm2, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	xmm4, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	xmm6, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	movq	xmm7, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      xmm2, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_SSE_SSE2_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jiss2fst.asm
+++ b/jiss2fst.asm
@@ -0,0 +1,512 @@
+;
+; jiss2fst.asm - fast integer IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_idct_ifast_sse2)
+
+EXTN(jpeg_idct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
+	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm0,xmm1
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
+	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm6,xmm7
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
--- a/jiss2int.asm
+++ b/jiss2int.asm
@@ -0,0 +1,869 @@
+;
+; jiss2int.asm - accurate integer IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_islow_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12
+
+	align	16
+	global	EXTN(jpeg_idct_islow_sse2)
+
+EXTN(jpeg_idct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm5,PASS1_BITS
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
--- a/jiss2red.asm
+++ b/jiss2red.asm
@@ -0,0 +1,607 @@
+;
+; jiss2red.asm - reduced-size IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
+PW_F256_F089	times 4 dw  F_2_562, F_0_899
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_4x4_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                     JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_idct_4x4_sse2)
+
+EXTN(jpeg_idct_4x4_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1
+	packsswb xmm0,xmm0
+	packsswb xmm0,xmm0
+	movd	eax,xmm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1
+	punpckhwd xmm5,xmm1
+	movdqa    xmm0,xmm4
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm2,xmm6
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0
+	punpckhwd xmm6,xmm3
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movd	_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movd	_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movd	_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jpeg_idct_2x2_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                     JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jpeg_idct_2x2_sse2)
+
+EXTN(jpeg_idct_2x2_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2
+	psrad	xmm6,DESCALE_P1_2
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; IDCT_SCALING_SUPPORTED
--- a/jisseflt.asm
+++ b/jisseflt.asm
@@ -0,0 +1,582 @@
+;
+; jisseflt.asm - floating-point IDCT (SSE & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414	times 4 dd  1.414213562373095048801689
+PD_1_847	times 4 dd  1.847759065022573512256366
+PD_1_082	times 4 dd  1.082392200292393968799446
+PD_M2_613	times 4 dd -2.613125929752753055713286
+PD_0_125	times 4 dd  0.125	; 1/8
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_sse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_float_sse)
+
+EXTN(jpeg_idct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
+	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
+
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
+	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
+
+	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
+	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
+	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
+	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
+	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
+	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
+	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
+	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
+
+	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
+	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
+	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
+	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
+	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
+	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
+	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
+	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
+	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
+	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
+
+	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
+	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
+	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
+	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
+
+	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
+	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
+
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
+	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
+	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
+	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
+	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
+
+	mulps	xmm6,xmm1		; descale(1/8)
+	mulps	xmm7,xmm1		; descale(1/8)
+	mulps	xmm5,xmm1		; descale(1/8)
+	mulps	xmm0,xmm1		; descale(1/8)
+
+	movhlps   xmm3,xmm6
+	movhlps   xmm1,xmm7
+	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
+	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
+	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
+	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
+
+	movhlps   xmm6,xmm5
+	movhlps   xmm7,xmm0
+	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
+	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
+	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
+	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
+	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
+	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
+
+	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
+	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
+
+	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
+	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm5,xmm3
+	movaps	xmm0,xmm1
+	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
+	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
+	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
+
+	mulps	xmm3,xmm6		; descale(1/8)
+	mulps	xmm1,xmm6		; descale(1/8)
+	mulps	xmm5,xmm6		; descale(1/8)
+	mulps	xmm0,xmm6		; descale(1/8)
+
+	movhlps   xmm7,xmm3
+	movhlps   xmm2,xmm1
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
+	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
+	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
+	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
+	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
+
+	movhlps   xmm4,xmm5
+	movhlps   xmm6,xmm0
+	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
+	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
+	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
+	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
+	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
+	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm6
+	paddb     mm1,mm6
+	paddb     mm2,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
+	movq      mm3,mm2		; transpose coefficients(phase 1)
+	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
+	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm6,mm3		; transpose coefficients(phase 2)
+	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
+	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
+
+	movq      mm1,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_SSE_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 27, 2004
+ * ---------------------------------------------------------------------
+ *
 * This file contains the JPEG system-independent memory management
 * routines.  This code is usable across a wide variety of machines; most
 * of the system dependencies have been isolated in a separate file.
@@ -51,27 +58,12 @@ extern char * getenv JPP((const char * name));


 /*
- * Many machines require storage alignment: longs must start on 4-byte
- * boundaries, doubles on 8-byte boundaries, etc.  On such machines, malloc()
- * always returns pointers that are multiples of the worst-case alignment
- * requirement, and we had better do so too.
- * There isn't any really portable way to determine the worst-case alignment
- * requirement.  This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double.  This is necessary on some
- * workstations (where doubles really do need 8-byte alignment) and will work
- * fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
- * The only place I know of where this will NOT work is certain Macintosh
- * 680x0 compilers that define double as a 10-byte IEEE extended float.
- * Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well.  Put "#define ALIGN_TYPE long" in jconfig.h if you have
- * such a compiler.
+ * SIMD Ext: Most of SSE/SSE2 instructions require that the memory address
+ * is aligned to a 16-byte boundary; if not, a general-protection exception
+ * (#GP) is generated.
 */

-#ifndef ALIGN_TYPE		/* so can override from jconfig.h */
-#define ALIGN_TYPE  double
-#endif
+#define ALIGN_SIZE  16		/* sizeof SSE/SSE2 register */


 /*
@@ -81,31 +73,24 @@ extern char * getenv JPP((const char * name));
 * header with a link to the next pool of the same class.
 * Small and large pool headers are identical except that the latter's
 * link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field.  This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
 */

-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;

-typedef union small_pool_struct {
-  struct {
-    small_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct small_pool_struct {
+  small_pool_ptr next;		/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
+  char dummy[ALIGN_SIZE-1];
 } small_pool_hdr;

-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;

-typedef union large_pool_struct {
-  struct {
-    large_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct large_pool_struct {
+  large_pool_ptr next;		/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
+  char dummy[ALIGN_SIZE-1];
 } large_pool_hdr;


@@ -197,16 +182,16 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 	  pool_id, mem->total_space_allocated);

  for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
-       lhdr_ptr = lhdr_ptr->hdr.next) {
+       lhdr_ptr = lhdr_ptr->next) {
    fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->hdr.bytes_used);
+	    (long) lhdr_ptr->bytes_used);
  }

  for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
-       shdr_ptr = shdr_ptr->hdr.next) {
+       shdr_ptr = shdr_ptr->next) {
    fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->hdr.bytes_used,
-	    (long) shdr_ptr->hdr.bytes_left);
+	    (long) shdr_ptr->bytes_used,
+	    (long) shdr_ptr->bytes_left);
  }
 }

@@ -266,10 +251,10 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
    out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */

-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
+  /* Round up the requested size to a multiple of ALIGN_SIZE */
+  odd_bytes = sizeofobject % ALIGN_SIZE;
  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
+    sizeofobject += ALIGN_SIZE - odd_bytes;

  /* See if space is available in any existing pool */
  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
@@ -277,10 +262,10 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  prev_hdr_ptr = NULL;
  hdr_ptr = mem->small_list[pool_id];
  while (hdr_ptr != NULL) {
-    if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+    if (hdr_ptr->bytes_left >= sizeofobject)
      break;			/* found pool with enough space */
    prev_hdr_ptr = hdr_ptr;
-    hdr_ptr = hdr_ptr->hdr.next;
+    hdr_ptr = hdr_ptr->next;
  }

  /* Time to make a new pool? */
@@ -305,20 +290,20 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
    }
    mem->total_space_allocated += min_request + slop;
    /* Success, initialize the new pool header and add to end of list */
-    hdr_ptr->hdr.next = NULL;
-    hdr_ptr->hdr.bytes_used = 0;
-    hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+    hdr_ptr->next = NULL;
+    hdr_ptr->bytes_used = 0;
+    hdr_ptr->bytes_left = sizeofobject + slop;
    if (prev_hdr_ptr == NULL)	/* first pool in class? */
      mem->small_list[pool_id] = hdr_ptr;
    else
-      prev_hdr_ptr->hdr.next = hdr_ptr;
+      prev_hdr_ptr->next = hdr_ptr;
  }

  /* OK, allocate the object from the current pool */
-  data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
-  data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
-  hdr_ptr->hdr.bytes_used += sizeofobject;
-  hdr_ptr->hdr.bytes_left -= sizeofobject;
+  data_ptr = (char *) ((size_t) (hdr_ptr + 1) & -ALIGN_SIZE);
+  data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+  hdr_ptr->bytes_used += sizeofobject;
+  hdr_ptr->bytes_left -= sizeofobject;

  return (void *) data_ptr;
 }
@@ -350,10 +335,10 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
    out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */

-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
+  /* Round up the requested size to a multiple of ALIGN_SIZE */
+  odd_bytes = sizeofobject % ALIGN_SIZE;
  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
+    sizeofobject += ALIGN_SIZE - odd_bytes;

  /* Always make a new pool */
  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
@@ -366,15 +351,15 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);

  /* Success, initialize the new pool header and add to list */
-  hdr_ptr->hdr.next = mem->large_list[pool_id];
+  hdr_ptr->next = mem->large_list[pool_id];
  /* We maintain space counts in each pool header for statistical purposes,
   * even though they are not needed for allocation.
   */
-  hdr_ptr->hdr.bytes_used = sizeofobject;
-  hdr_ptr->hdr.bytes_left = 0;
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
  mem->large_list[pool_id] = hdr_ptr;

-  return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+  return (void FAR *) ((size_t) (hdr_ptr + 1) & -ALIGN_SIZE);
 }


@@ -401,6 +386,12 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
  JSAMPROW workspace;
  JDIMENSION rowsperchunk, currow, i;
  long ltemp;
+  JDIMENSION odd_samples;
+
+  /* Round up the row bytes to a multiple of ALIGN_SIZE */
+  odd_samples = samplesperrow % (ALIGN_SIZE / SIZEOF(JSAMPLE));
+  if (odd_samples > 0)
+    samplesperrow += (ALIGN_SIZE / SIZEOF(JSAMPLE)) - odd_samples;

  /* Calculate max # of rows allowed in one allocation chunk */
  ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
@@ -968,9 +959,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
  mem->large_list[pool_id] = NULL;

  while (lhdr_ptr != NULL) {
-    large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
-    space_freed = lhdr_ptr->hdr.bytes_used +
-		  lhdr_ptr->hdr.bytes_left +
+    large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+    space_freed = lhdr_ptr->bytes_used +
+		  lhdr_ptr->bytes_left +
 		  SIZEOF(large_pool_hdr);
    jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
    mem->total_space_allocated -= space_freed;
@@ -982,9 +973,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
  mem->small_list[pool_id] = NULL;

  while (shdr_ptr != NULL) {
-    small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
-    space_freed = shdr_ptr->hdr.bytes_used +
-		  shdr_ptr->hdr.bytes_left +
+    small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+    space_freed = shdr_ptr->bytes_used +
+		  shdr_ptr->bytes_left +
 		  SIZEOF(small_pool_hdr);
    jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
    mem->total_space_allocated -= space_freed;
@@ -1035,22 +1026,22 @@ jinit_memory_mgr (j_common_ptr cinfo)
  cinfo->mem = NULL;		/* for safety if init fails */

  /* Check for configuration errors.
-   * SIZEOF(ALIGN_TYPE) should be a power of 2; otherwise, it probably
+   * ALIGN_SIZE should be a power of 2; otherwise, it probably
   * doesn't reflect any real hardware alignment requirement.
   * The test is a little tricky: for X>0, X and X-1 have no one-bits
   * in common if and only if X is a power of 2, ie has only one one-bit.
   * Some compilers may give an "unreachable code" warning here; ignore it.
   */
-  if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
    ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
  /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
-   * a multiple of SIZEOF(ALIGN_TYPE).
+   * a multiple of ALIGN_SIZE.
   * Again, an "unreachable code" warning may be ignored here.
   * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
   */
  test_mac = (size_t) MAX_ALLOC_CHUNK;
  if ((long) test_mac != MAX_ALLOC_CHUNK ||
-      (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+      (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);

  max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 28, 2005
+ * ---------------------------------------------------------------------
+ *
 * This file contains additional configuration options that customize the
 * JPEG software for special applications or support machine-dependent
 * optimizations.  Most users will not need to touch this file.
@@ -20,7 +27,9 @@
 * We do not support run-time selection of data precision, sorry.
 */

-#define BITS_IN_JSAMPLE  8	/* use 8 or 12 */
+/* SIMD Ext: This SIMD code only copes with 8-bit sample values. */
+
+#define BITS_IN_JSAMPLE  8	/* SIMD Ext: cannot be changed! */


 /*
@@ -157,7 +166,8 @@ typedef short INT16;

 /* INT32 must hold at least signed 32-bit values. */

-#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+	/* X11/xmd.h and basetsd.h (Win32 SDK) correctly define INT32 */
+#if !defined(XMD_H) && !defined(_BASETSD_H_) && !defined(_BASETSD_H)
 typedef long INT32;
 #endif

@@ -180,14 +190,24 @@ typedef unsigned int JDIMENSION;
 * or code profilers that require it.
 */

+#if defined(_MSC_VER) || defined(__BORLANDC__) || \
+    defined(__WATCOMC__) || defined(__MWERKS__) || \
+    defined(__ICC) || defined(__INTEL_COMPILER)
+#define JCDECL  __cdecl
+#elif defined(__GNUC__)
+#define JCDECL  __attribute__((__cdecl__))
+#else
+#define JCDECL
+#endif
+
 /* a function called through method pointers: */
-#define METHODDEF(type)		static type
+#define METHODDEF(type)		static type JCDECL
 /* a function used only in its module: */
 #define LOCAL(type)		static type
 /* a function referenced thru EXTERNs: */
-#define GLOBAL(type)		type
+#define GLOBAL(type)		type JCDECL
 /* a reference to a GLOBAL function: */
-#define EXTERN(type)		extern type
+#define EXTERN(type)		extern type JCDECL


 /* This macro is used to declare a "method", that is, a function pointer.
@@ -197,9 +217,9 @@ typedef unsigned int JDIMENSION;
 */

 #ifdef HAVE_PROTOTYPES
-#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#define JMETHOD(type,methodname,arglist)  type (JCDECL *methodname) arglist
 #else
-#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
+#define JMETHOD(type,methodname,arglist)  type (JCDECL *methodname) ()
 #endif


@@ -209,11 +229,13 @@ typedef unsigned int JDIMENSION;
 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
 */

+#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
+#endif /* !FAR */


 /*
@@ -224,8 +246,14 @@ typedef unsigned int JDIMENSION;
 */

 #ifndef HAVE_BOOLEAN
-typedef int boolean;
+#ifdef TYPEDEF_UCHAR_BOOLEAN
+#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
 #endif
+#else /* !TYPEDEF_UCHAR_BOOLEAN */
+typedef int boolean;
+#endif /* TYPEDEF_UCHAR_BOOLEAN */
+#endif /* !HAVE_BOOLEAN */
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
 #endif
@@ -290,6 +318,7 @@ typedef int boolean;
 #define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
+#define UPSAMPLE_H1V2_SUPPORTED	    /* Fast/fancy processing for 1h2v? */
 #define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
 #define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */

@@ -316,6 +345,84 @@ typedef int boolean;
 #define RGB_BLUE	2	/* Offset of Blue */
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */

+#undef RGBX_FILLER_0XFF 	/* fill dummy bytes with 0xFF in RGBX format */
+
+
+/* SIMD support options: */
+
+#ifndef JSIMD_MMX_NOT_SUPPORTED
+#define JSIMD_ENCODER_MMX_SUPPORTED	/* Use MMX    in encoding process */
+#define JSIMD_DECODER_MMX_SUPPORTED	/* Use MMX    in decoding process */
+#endif
+#ifndef JSIMD_3DNOW_NOT_SUPPORTED
+#define JSIMD_ENCODER_3DNOW_SUPPORTED	/* Use 3DNow! in encoding process */
+#define JSIMD_DECODER_3DNOW_SUPPORTED	/* Use 3DNow! in decoding process */
+#endif
+#ifndef JSIMD_SSE_NOT_SUPPORTED
+#define JSIMD_ENCODER_SSE_SUPPORTED	/* Use SSE    in encoding process */
+#define JSIMD_DECODER_SSE_SUPPORTED	/* Use SSE    in decoding process */
+#endif
+#ifndef JSIMD_SSE2_NOT_SUPPORTED
+#define JSIMD_ENCODER_SSE2_SUPPORTED	/* Use SSE2   in encoding process */
+#define JSIMD_DECODER_SSE2_SUPPORTED	/* Use SSE2   in decoding process */
+#endif
+
+/* (encoder part): */
+
+#undef JFDCT_INT_QUANTIZE_WITH_DIVISION /* Use general quantization method */
+
+#if defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JCCOLOR_RGBYCC_MMX_SUPPORTED	/* RGB->YCC conversion with MMX */
+#define JCSAMPLE_MMX_SUPPORTED		/* downsampling with MMX */
+#define JFDCT_INT_MMX_SUPPORTED		/* forward DCT with MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE2_SUPPORTED)
+#define JCCOLOR_RGBYCC_SSE2_SUPPORTED	/* RGB->YCC conversion with SSE2 */
+#define JCSAMPLE_SSE2_SUPPORTED		/* downsampling with SSE2 */
+#define JFDCT_INT_SSE2_SUPPORTED	/* forward DCT with SSE2 */
+#endif
+#if defined(JSIMD_ENCODER_3DNOW_SUPPORTED) && \
+    defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JFDCT_FLT_3DNOW_MMX_SUPPORTED	/* forward DCT with 3DNow!/MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE_SUPPORTED) && \
+    defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JFDCT_FLT_SSE_MMX_SUPPORTED	/* forward DCT with SSE/MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE_SUPPORTED) && \
+    defined(JSIMD_ENCODER_SSE2_SUPPORTED)
+#define JFDCT_FLT_SSE_SSE2_SUPPORTED	/* forward DCT with SSE/SSE2 */
+#endif
+
+/* (decoder part): */
+
+#if defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JDCOLOR_YCCRGB_MMX_SUPPORTED	/* YCC->RGB conversion with MMX */
+#define JDMERGE_MMX_SUPPORTED		/* merged upsampling with MMX */
+#define JDSAMPLE_FANCY_MMX_SUPPORTED	/* fancy upsampling with MMX */
+#define JDSAMPLE_SIMPLE_MMX_SUPPORTED	/* sloppy upsampling with MMX */
+#define JIDCT_INT_MMX_SUPPORTED		/* inverse DCT with MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE2_SUPPORTED)
+#define JDCOLOR_YCCRGB_SSE2_SUPPORTED	/* YCC->RGB conversion with SSE2 */
+#define JDMERGE_SSE2_SUPPORTED		/* merged upsampling with SSE2 */
+#define JDSAMPLE_FANCY_SSE2_SUPPORTED	/* fancy upsampling with SSE2 */
+#define JDSAMPLE_SIMPLE_SSE2_SUPPORTED	/* sloppy upsampling with SSE2 */
+#define JIDCT_INT_SSE2_SUPPORTED	/* inverse DCT with SSE2 */
+#endif
+#if defined(JSIMD_DECODER_3DNOW_SUPPORTED) && \
+    defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JIDCT_FLT_3DNOW_MMX_SUPPORTED	/* inverse DCT with 3DNow!/MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE_SUPPORTED) && \
+    defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JIDCT_FLT_SSE_MMX_SUPPORTED	/* inverse DCT with SSE/MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE_SUPPORTED) && \
+    defined(JSIMD_DECODER_SSE2_SUPPORTED)
+#define JIDCT_FLT_SSE_SSE2_SUPPORTED	/* inverse DCT with SSE/SSE2 */
+#endif
+

 /* Definitions for speed-related optimizations. */

@@ -328,6 +435,9 @@ typedef int boolean;
 #ifdef __GNUC__			/* for instance, GNU C knows about inline */
 #define INLINE __inline__
 #endif
+#ifdef _MSC_VER
+#define INLINE __inline
+#endif
 #ifndef INLINE
 #define INLINE			/* default is to define it as empty */
 #endif
--- a/jpegdll.def
+++ b/jpegdll.def
@@ -0,0 +1,73 @@
+;
+; jpegdll.def - module definition file for Win32 DLL
+;
+
+; sed -e "/\(jinit\|jpeg_simd_\(cpu\|os\|merged\)\)/d" -e "s/^EXTERN(..*) \([_A-Za-z][_A-Za-z0-9]*\).*/  \1/p" -e d jpeglib.h jpegint.h
+
+EXPORTS
+  ; API functions in jpeglib.h, which are intended
+  ; to be called by the user applications.
+  jpeg_std_error
+  jpeg_CreateCompress
+  jpeg_CreateDecompress
+  jpeg_destroy_compress
+  jpeg_destroy_decompress
+  jpeg_stdio_dest
+  jpeg_stdio_src
+  jpeg_set_defaults
+  jpeg_set_colorspace
+  jpeg_default_colorspace
+  jpeg_set_quality
+  jpeg_set_linear_quality
+  jpeg_add_quant_table
+  jpeg_quality_scaling
+  jpeg_simple_progression
+  jpeg_suppress_tables
+  jpeg_alloc_quant_table
+  jpeg_alloc_huff_table
+  jpeg_start_compress
+  jpeg_write_scanlines
+  jpeg_finish_compress
+  jpeg_write_raw_data
+  jpeg_write_marker
+  jpeg_write_m_header
+  jpeg_write_m_byte
+  jpeg_write_tables
+  jpeg_read_header
+  jpeg_start_decompress
+  jpeg_read_scanlines
+  jpeg_finish_decompress
+  jpeg_read_raw_data
+  jpeg_has_multiple_scans
+  jpeg_start_output
+  jpeg_finish_output
+  jpeg_input_complete
+  jpeg_new_colormap
+  jpeg_consume_input
+  jpeg_calc_output_dimensions
+  jpeg_save_markers
+  jpeg_set_marker_processor
+  jpeg_read_coefficients
+  jpeg_write_coefficients
+  jpeg_copy_critical_parameters
+  jpeg_abort_compress
+  jpeg_abort_decompress
+  jpeg_abort
+  jpeg_destroy
+  jpeg_resync_to_restart
+  ; Functions that are introduced by SIMD extension.
+  jpeg_simd_support
+  jpeg_simd_mask
+  jpeg_simd_color_converter
+  jpeg_simd_downsampler
+  jpeg_simd_forward_dct
+  jpeg_simd_color_deconverter
+  jpeg_simd_upsampler
+  jpeg_simd_inverse_dct
+  ; Utility functions in jutils.c.
+  ; These are needed by some applications.
+  jdiv_round_up
+  jround_up
+  jcopy_sample_rows
+  jcopy_block_row
+  jzero_far
--- a/jpegdll.rc
+++ b/jpegdll.rc
@@ -0,0 +1,57 @@
+//
+// jpegdll.rc - version information for Win32 DLL
+//
+
+// from <winver.h>
+#define VS_VERSION_INFO         1
+#define VS_FFI_FILEFLAGSMASK    0x0000003FL
+#define VS_FF_DEBUG             0x00000001L
+#define VOS__WINDOWS32          0x00000004L
+#define VFT_DLL                 0x00000002L
+#define VFT2_UNKNOWN            0x00000000L
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION    6,2,1,2
+ PRODUCTVERSION 6,2,1,2
+ FILEFLAGSMASK  VS_FFI_FILEFLAGSMASK
+#ifdef _DEBUG
+ FILEFLAGS      VS_FF_DEBUG
+#else
+ FILEFLAGS      0x00000000L
+#endif
+ FILEOS         VOS__WINDOWS32
+ FILETYPE       VFT_DLL
+ FILESUBTYPE    VFT2_UNKNOWN
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "00000000"
+        BEGIN
+            VALUE "LegalCopyright",  "Copyright (C) 1991-1998 Thomas G. Lane\0"
+            VALUE "FileDescription", "Independent JPEG Group's JPEG Library"
+                                     " with SIMD support\0"
+            VALUE "ProductName", "The Independent JPEG Group's JPEG software"
+                                 " release 6b   with x86 SIMD extension for"
+                                 " IJG JPEG library version 1.02\0"
+            VALUE "Comments", "This is not an official binary from IJG.   "
+                              "The SIMD code in this DLL is copyright (C)"
+                              " 1999-2006 MIYASAKA Masaru.\0"
+            VALUE "FileVersion",      "6.2.1.02\0"
+            VALUE "ProductVersion",   "6.2.1.02\0"
+            VALUE "OriginalFilename", "jpeg62.dll\0"
+            VALUE "InternalName",     "jpeg62\0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x0, 0
+    END
+END
+
+/////////////////////////////////////////////////////////////////////////////
--- a/jpegint.h
+++ b/jpegint.h
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : February 4, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file provides common declarations for the various JPEG modules.
 * These declarations are considered internal to the JPEG library; most
 * applications using the library shouldn't need to include this file.
@@ -291,6 +298,19 @@ struct jpeg_color_quantizer {
 #endif


+/* SIMD Ext: This macro checks if constants for SSE/SSE2 instructions are
+ * aligned to a 16-byte boundary. Most of SSE/SSE2 instructions require
+ * that the memory operand is aligned to a 16-byte boundary; if not,
+ * a general-protection exception (#GP) is generated.
+ */
+
+#ifdef JSIMD_NO_SSECONST_ALIGNMENT_CHECK
+#define IS_CONST_ALIGNED_16(p)	(1)
+#else
+#define IS_CONST_ALIGNED_16(p)	(((unsigned)(p) & 0x0F) == 0)
+#endif
+
+
 /* Short forms of external names for systems with brain-damaged linkers. */

 #ifdef NEED_SHORT_EXTERNAL_NAMES
@@ -327,6 +347,8 @@ struct jpeg_color_quantizer {
 #define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_simd_cpu_support	jSiCpuSupport
+#define jpeg_simd_os_support	jSiOsSupport
 #endif /* NEED_SHORT_EXTERNAL_NAMES */


@@ -382,6 +404,10 @@ extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */

+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+EXTERN(unsigned int) jpeg_simd_os_support JPP((unsigned int simd));
+
 /* Suppress undefined-structure complaints if necessary. */

 #ifdef INCOMPLETE_TYPES_BROKEN
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -5,6 +5,13 @@
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : February 4, 2006
+ * ---------------------------------------------------------------------
+ *
 * This file defines the application interface for the JPEG library.
 * Most applications using the library need only include this file,
 * and perhaps jerror.h if they want to know the exact error codes.
@@ -13,6 +20,10 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
 * First we include the configuration files that record how this
 * installation of the JPEG library is set up.  jconfig.h can be
@@ -33,6 +44,13 @@
 #define JPEG_LIB_VERSION  62	/* Version 6b */


+/* SIMD Ext: Version ID for the SIMD extension.
+ */
+
+#define JPEG_SIMDEXT_VERSION  102	/* version 1.02 */
+#define JPEG_SIMDEXT_VER_STR  "1.02"
+
+
 /* Various constants determining the sizes of things.
 * All of these are specified by the JPEG standard, so don't change them
 * if you want to be compatible.
@@ -235,6 +253,15 @@ typedef enum {
 	JDITHER_FS		/* Floyd-Steinberg error diffusion dither */
 } J_DITHER_MODE;

+/* SIMD Ext: bitflags for jpeg_simd_support() and jpeg_simd_mask() */
+
+#define JSIMD_NONE    0x00
+#define JSIMD_MMX     0x01
+#define JSIMD_3DNOW   0x02
+#define JSIMD_SSE     0x04
+#define JSIMD_SSE2    0x08
+#define JSIMD_ALL     (JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2)
+

 /* Common fields between JPEG compression and decompression master structs. */

@@ -877,6 +904,18 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_abort		jAbort
 #define jpeg_destroy		jDestroy
 #define jpeg_resync_to_restart	jResyncRestart
+#define jpeg_simd_support	jSiSupport
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+#define jpeg_simd_mask		jSiMask
+#endif
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+#define jpeg_simd_color_converter	jSiCColor
+#define jpeg_simd_downsampler		jSiDownsampler
+#define jpeg_simd_forward_dct		jSiFDCT
+#define jpeg_simd_color_deconverter	jSiDColor
+#define jpeg_simd_upsampler		jSiUpsampler
+#define jpeg_simd_inverse_dct		jSiIDCT
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 #endif /* NEED_SHORT_EXTERNAL_NAMES */


@@ -1037,6 +1076,24 @@ EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo));
 EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
 					    int desired));

+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_support JPP((j_common_ptr cinfo));
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_mask
+	JPP((j_common_ptr cinfo, unsigned int remove, unsigned int add));
+#endif
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_color_converter JPP((j_compress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_downsampler JPP((j_compress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_forward_dct JPP((j_compress_ptr cinfo,
+						int method));
+EXTERN(unsigned int) jpeg_simd_color_deconverter JPP((j_decompress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_upsampler JPP((j_decompress_ptr cinfo,
+					      int do_fancy));
+EXTERN(unsigned int) jpeg_simd_inverse_dct JPP((j_decompress_ptr cinfo,
+						int method));
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+

 /* These marker codes are exported since applications and data source modules
 * are likely to want to use them.
@@ -1093,4 +1150,8 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif

+#ifdef __cplusplus
+}
+#endif
+
 #endif /* JPEGLIB_H */
--- a/jsimdcpu.asm
+++ b/jsimdcpu.asm
@@ -0,0 +1,112 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : August 23, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+
+copyright:
+	db	" x86 SIMD ext for IJG lib V", JPEG_SIMDEXT_VER_STR
+	db	" Copyright 2006, MIYASAKA Masaru "
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+	align	16
+	global	EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+	push	edi
+
+	xor	edi,edi			; simd support flag
+
+	pushfd
+	pop	eax
+	mov	edx,eax
+	xor	eax, 1<<21		; flip ID bit in EFLAGS
+	push	eax
+	popfd
+	pushfd
+	pop	eax
+	xor	eax,edx
+	jz	short .return		; CPUID is not supported
+
+	; Check for MMX, SSE and SSE2 instruction support
+	xor	eax,eax
+	cpuid
+	test	eax,eax
+	jz	short .return
+
+	xor	eax,eax
+	inc	eax
+	cpuid
+	mov	eax,edx			; eax = Standard feature flags
+
+	test	eax, 1<<23		; bit23:MMX
+	jz	short .no_mmx
+	or	edi, byte JSIMD_MMX
+.no_mmx:
+	test	eax, 1<<25		; bit25:SSE
+	jz	short .no_sse
+	or	edi, byte JSIMD_SSE
+.no_sse:
+	test	eax, 1<<26		; bit26:SSE2
+	jz	short .no_sse2
+	or	edi, byte JSIMD_SSE2
+.no_sse2:
+
+	; Check for 3DNow! instruction support
+	mov	eax, 0x80000000
+	cpuid
+	cmp	eax, 0x80000000
+	jbe	short .return
+
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax,edx			; eax = Extended feature flags
+
+	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
+	jz	short .no_3dnow
+	or	edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+	mov	eax,edi
+
+	pop	edi
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	ret
+
--- a/jsimddjg.asm
+++ b/jsimddjg.asm
@@ -0,0 +1,130 @@
+;
+; jsimddjg.asm - SIMD instruction support check (for DJGPP V.2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : September 26, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the OS supports SIMD instructions (DJGPP V.2)
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_os_support (unsigned int simd)
+;
+
+%define EXCEPTION_ILLEGAL_INSTRUCTION	6	; vector number of #UD
+
+%define simd	ebp+8			; unsigned int simd
+%define mxcsr	ebp-4			; unsigned int mxcsr = 0x1F80
+
+	align	16
+	global	EXTN(jpeg_simd_os_support)
+
+EXTN(jpeg_simd_os_support):
+	push	ebp
+	mov	ebp,esp
+	push	dword 0x1F80		; default value of MXCSR register
+	push	ebx
+
+	push	DWORD [simd]	; simd_flags - modified from exception_handler
+
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0202	; Get Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+	push	ecx		; selector of old exception handler
+	push	edx		; offset   of old exception handler
+
+	mov	ecx,cs
+	mov	edx, exception_handler
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0203	; Set Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+
+	mov	eax, DWORD [simd]
+
+	; If floating point emulation is enabled (CR0.EM = 1),
+	; executing an MMX/3DNow! instruction generates invalid
+	; opcode exception (#UD).
+
+	push	byte (.mmx_1 - .mmx_0)		; inst_bytes
+	push	byte (JSIMD_MMX | JSIMD_3DNOW)	; test_flags
+	test	eax, DWORD [esp]
+	jz	short .mmx_1
+.mmx_0:	emms				; executing MMX instruction
+.mmx_1:	add	esp, byte 8
+
+	push	byte (.sse_1 - .sse_0)
+	push	byte (JSIMD_SSE | JSIMD_SSE2)
+	test	eax, DWORD [esp]
+	jz	short .sse_1
+.sse_0:	ldmxcsr	DWORD [mxcsr]		; executing SSE instruction
+.sse_1:	add	esp, byte 8
+
+	pop	edx		; offset   of old exception handler
+	pop	ecx		; selector of old exception handler
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0203	; Set Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+
+	pop	eax		; return simd_flags
+	and	eax, byte JSIMD_ALL
+
+	pop	ebx
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; LOCAL(void) far
+; exception_handler (unsigned long error_code,
+;                    void * context_eip, unsigned short context_cs,
+;                    unsigned long context_eflags,
+;                    void * context_esp, unsigned short context_ss);
+;
+
+%define error_code	esp+12+8	; unsigned long error_code
+%define context_eip	esp+12+12	; void * context_eip
+%define context_cs	esp+12+16	; unsigned short context_cs
+%define context_eflags	esp+12+20	; unsigned long context_eflags
+%define context_esp	esp+12+24	; void * context_esp
+%define context_ss	esp+12+28	; unsigned short context_ss
+
+%define test_flags(b)	(b)+0
+%define inst_bytes(b)	(b)+4
+%define simd_flags(b)	(b)+16
+
+	align	16
+
+exception_handler:
+	push	eax
+	push	ecx
+	push	edx
+
+	mov	eax, POINTER [context_esp]
+	mov	ecx, DWORD [test_flags(eax)]
+	mov	edx, DWORD [inst_bytes(eax)]
+	not	ecx
+	add	POINTER [context_eip], edx	; next instruction
+	and	DWORD [simd_flags(eax)], ecx	; turn off flag
+
+	pop	edx
+	pop	ecx
+	pop	eax
+	retf
+
--- a/jsimdext.inc
+++ b/jsimdext.inc
@@ -0,0 +1,347 @@
+;
+; jsimdext.inc - common declarations
+;
+; x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%ifndef JSIMDCFG_INCLUDED	; in case jsimdcfg.inc already did
+%include "jsimdcfg.inc"		; configuration declarations
+%endif
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .rdata align=16 public use32 class=CONST
+
+%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .data  align=16 public use32 class=DATA
+
+%elifdef ELF	; ----(nasm -felf -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
+%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
+%define EXTN(name)  name			; foo() -> foo
+
+%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
+
+%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
+%define SEG_CONST   .rodata align=16
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
+
+%else		; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+%endif	; ----------------------------------------------
+
+; ==========================================================================
+
+; ---- jpeglib.h -----------------------------------------------------------
+
+%define DCTSIZE		8	; The basic DCT block is 8x8 samples
+%define DCTSIZE2	64	; DCTSIZE squared; # of elements in a block
+
+%define JSIMD_NONE	0x00	; bitflags for jpeg_simd_*_support()
+%define JSIMD_MMX	0x01
+%define JSIMD_3DNOW	0x02
+%define JSIMD_SSE	0x04
+%define JSIMD_SSE2	0x08
+%define JSIMD_ALL	(JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2)
+
+; ---- jpegint.h -----------------------------------------------------------
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_simd_cpu_support	jSiCpuSupport
+%define jpeg_simd_os_support	jSiOsSupport
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; ---- jmorecfg.h ----------------------------------------------------------
+;
+; BITS_IN_JSAMPLE==8 (8-bit sample values) is the only valid setting
+; on this SIMD implementation.
+;
+%define BITS_IN_JSAMPLE	8	; Caution: Cannot be changed
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE		byte		; unsigned char
+%define SIZEOF_JSAMPLE	SIZEOF_BYTE	; sizeof(JSAMPLE)
+%define MAXJSAMPLE	255
+%define CENTERJSAMPLE	128
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF		word		; short
+%define SIZEOF_JCOEF	SIZEOF_WORD	; sizeof(JCOEF)
+
+; INT32 must hold at least signed 32-bit values.
+; On this SIMD implementation, this must be 'long'.
+;
+%define INT32		dword		; long
+%define SIZEOF_INT32	SIZEOF_DWORD	; sizeof(INT32)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION		dword		; unsigned int
+%define SIZEOF_JDIMENSION	SIZEOF_DWORD	; sizeof(JDIMENSION)
+
+; --------------------------------------------------------------------------
+
+%define JSAMPROW		POINTER		; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY		POINTER		; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE		POINTER		; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR		POINTER		; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW		SIZEOF_POINTER	; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY	SIZEOF_POINTER	; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE	SIZEOF_POINTER	; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR		SIZEOF_POINTER	; sizeof(JCOEFPTR)
+
+%define POINTER			dword		; general pointer type
+%define SIZEOF_POINTER		SIZEOF_DWORD	; sizeof(POINTER)
+%define POINTER_BIT		DWORD_BIT	; sizeof(POINTER)*BYTE_BIT
+
+%define INT			dword		; signed integer type
+%define SIZEOF_INT		SIZEOF_DWORD	; sizeof(INT)
+%define INT_BIT			DWORD_BIT	; sizeof(INT)*BYTE_BIT
+
+%define FP32			dword		; IEEE754 single
+%define SIZEOF_FP32		SIZEOF_DWORD	; sizeof(FP32)
+%define FP32_BIT		DWORD_BIT	; sizeof(FP32)*BYTE_BIT
+
+%define FP64			qword		; IEEE754 double
+%define SIZEOF_FP64		SIZEOF_QWORD	; sizeof(FP64)
+%define FP64_BIT		QWORD_BIT	; sizeof(FP64)*BYTE_BIT
+
+%define FP80			tword		; IEEE754 double-extended(x86)
+%define SIZEOF_FP80		SIZEOF_TWORD	; sizeof(FP80)
+%define FP80_BIT		TWORD_BIT	; sizeof(FP80)*BYTE_BIT
+
+%define MMWORD			qword		; int64  (MMX register)
+%define SIZEOF_MMWORD		SIZEOF_QWORD	; sizeof(MMWORD)
+%define MMWORD_BIT		QWORD_BIT	; sizeof(MMWORD)*BYTE_BIT
+
+%define XMMWORD			dqword		; int128 (SSE register)
+%define SIZEOF_XMMWORD		SIZEOF_DQWORD	; sizeof(XMMWORD)
+%define XMMWORD_BIT		DQWORD_BIT	; sizeof(XMMWORD)*BYTE_BIT
+
+%define SIZEOF_BYTE		1		; sizeof(BYTE)
+%define SIZEOF_WORD		2		; sizeof(WORD)
+%define SIZEOF_DWORD		4		; sizeof(DWORD)
+%define SIZEOF_QWORD		8		; sizeof(QWORD)
+%define SIZEOF_TBYTE		10		; sizeof(TBYTE)
+%define SIZEOF_TWORD		10		; sizeof(TWORD)
+%define SIZEOF_DQWORD		16		; sizeof(DQWORD)
+
+%define BYTE_BIT		8		; CHAR_BIT in C
+%define WORD_BIT		16		; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT		32		; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT		64		; sizeof(QWORD)*BYTE_BIT
+%define TBYTE_BIT		80		; sizeof(TBYTE)*BYTE_BIT
+%define TWORD_BIT		80		; sizeof(TWORD)*BYTE_BIT
+%define DQWORD_BIT		128		; sizeof(DQWORD)*BYTE_BIT
+
+%idefine TBYTE	TWORD	; NASM uses the keyword 'TWORD' instead of 'TBYTE'
+%idefine DQWORD		; currently not supported by NASM
+%idefine _MMWORD	;
+%idefine _DWORD		;
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)   _ %+ name		; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+	SECTION	SEG_CONST
+const_base:
+
+%define GOTOFF(got,sym) (got) + (sym) - const_base
+
+%imacro get_GOT	1
+	; NOTE: this macro destroys ecx resister.
+	call	%%geteip
+	add	ecx, byte (%%ref - $)
+	jmp	short %%adjust
+%%geteip:
+	mov	ecx, POINTER [esp]
+	ret
+%%adjust:
+	push	ebp
+	xor	ebp,ebp		; ebp = 0
+%ifidni %1,ebx	; (%1 == ebx)
+	; db 0x8D,0x9C + jmp near const_base =
+	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+	db	0x8D,0x9C		; 8D,9C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+	; db 0x8D,0x8C + jmp near const_base =
+	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+	db	0x8D,0x8C		; 8D,8C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:	mov	%1, ecx
+%endif ; (%1 == ebx)
+	pop	ebp
+%endmacro
+
+%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT	1
+	extern	GOT_SYMBOL
+	call	%%geteip
+	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+	jmp	short %%done
+%%geteip:
+	mov	%1, POINTER [esp]
+	ret
+%%done:
+%endmacro
+
+%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic	1.nolist
+	push	%1
+%endmacro
+%imacro poppic	1.nolist
+	pop	%1
+%endmacro
+%imacro movpic	2.nolist
+	mov	%1,%2
+%endmacro
+
+%else	; !PIC -----------------------------------------
+
+%define GOTOFF(got,sym) (sym)
+
+%imacro get_GOT	1.nolist
+%endmacro
+%imacro pushpic	1.nolist
+%endmacro
+%imacro poppic	1.nolist
+%endmacro
+%imacro movpic	2.nolist
+%endmacro
+
+%endif	;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b,n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+	       db 0x90                               ; nop
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+	       db 0x8B,0xED                          ; mov ebp,ebp
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+	       db 0x90                               ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+	align %1, db 0		; filling zeros
+%endmacro
+
+; --------------------------------------------------------------------------
--- a/jsimdgcc.c
+++ b/jsimdgcc.c
@@ -0,0 +1,95 @@
+/*
+ * jsimdgcc.c - SIMD instruction support check (gcc)
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * Last Modified : January 24, 2006
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#include <setjmp.h>
+#include <signal.h>
+
+
+static volatile int lockf /* = 0 */;
+static jmp_buf jmpbuf;
+
+
+/*
+ * Exception handler for signal()
+ */
+
+LOCAL(void)
+exception_handler (int sig)
+{
+  signal(SIGILL, SIG_DFL);
+  longjmp(jmpbuf, 1);
+}
+
+
+/*
+ * Check if the OS supports SIMD instructions
+ */
+
+GLOBAL(unsigned int)
+jpeg_simd_os_support (unsigned int simd)
+{
+#ifdef __GNUC__		/* gcc (i386) */
+  unsigned int mxcsr = 0x1F80;
+
+  /* enter critical section */
+  __asm__ __volatile__ (
+  "get_lock:                  \n\t"
+    "movl  $1,%%eax           \n\t"
+    "xchgl %0,%%eax           \n\t"	/* try to get lock */
+    "cmpl  $0,%%eax           \n\t"	/* test if successful */
+    "je    critical_section   \n"
+  "spin_loop:                 \n\t"
+  /*".byte 0xF3,0x90          \n\t"*/	/* "pause" on P4 (short delay) */
+    "cmpl  $0,%0              \n\t"	/* check if lock is free */
+    "jne   spin_loop          \n\t"
+    "jmp   get_lock           \n"
+  "critical_section:          \n\t"
+     : "=m" (lockf) : "m" (lockf) : "%eax"
+  );
+
+  /* If floating point emulation is enabled (CR0.EM = 1),
+   * executing an MMX/3DNow! instruction generates invalid
+   * opcode exception (#UD).
+   */
+  if (simd & (JSIMD_MMX | JSIMD_3DNOW)) {
+    if (!setjmp(jmpbuf)) {
+      signal(SIGILL, exception_handler);
+      __asm__ __volatile__ (
+        ".byte 0x0F,0x77"		/* emms */
+      );
+      signal(SIGILL, SIG_DFL);
+    } else {
+      simd &= ~(JSIMD_MMX | JSIMD_3DNOW);
+    }
+  }
+  if (simd & (JSIMD_SSE | JSIMD_SSE2)) {
+    if (!setjmp(jmpbuf)) {
+      signal(SIGILL, exception_handler);
+      __asm__ __volatile__ (
+        "leal  %0,%%eax        \n\t"
+        ".byte 0x0F,0xAE,0x10  \n\t"	/* ldmxcsr [eax] */
+         : : "m" (mxcsr) : "%eax"
+      );
+      signal(SIGILL, SIG_DFL);
+    } else {
+      simd &= ~(JSIMD_SSE | JSIMD_SSE2);
+    }
+  }
+
+  /* leave critical section */
+  lockf = 0;	/* release lock */
+#endif	/* __GNUC__ */
+
+  return simd;
+}
--- a/jsimdw32.asm
+++ b/jsimdw32.asm
@@ -0,0 +1,121 @@
+;
+; jsimdw32.asm - SIMD instruction support check (for Win32)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : September 26, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the OS supports SIMD instructions (Win32)
+;
+; Reference: "Win32 Exception handling for assembler programmers"
+;               http://www.jorgon.freeserve.co.uk/Except/Except.htm
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_os_support (unsigned int simd)
+;
+
+%define simd	ebp+8			; unsigned int simd
+%define mxcsr	ebp-4			; unsigned int mxcsr = 0x1F80
+
+	align	16
+	global	EXTN(jpeg_simd_os_support)
+
+EXTN(jpeg_simd_os_support):
+	push	ebp
+	mov	ebp,esp
+	push	dword 0x1F80		; default value of MXCSR register
+	push	exception_handler
+	push	POINTER [fs:0]		; prev_record_ptr
+	mov	POINTER [fs:0], esp	; this_record_ptr
+
+	mov	eax, DWORD [simd]
+	and	eax, byte JSIMD_ALL
+	xor	ecx,ecx
+	xor	edx,edx
+
+	; If floating point emulation is enabled (CR0.EM = 1),
+	; executing an MMX/3DNow! instruction generates invalid
+	; opcode exception (#UD).
+
+	mov	cl, (JSIMD_MMX | JSIMD_3DNOW)
+	mov	dl, (.mmx_1 - .mmx_0)
+	test	al,cl
+	jz	short .mmx_1
+.mmx_0:	emms				; executing MMX instruction
+.mmx_1:
+	mov	cl, (JSIMD_SSE | JSIMD_SSE2)
+	mov	dl, (.sse_1 - .sse_0)
+	test	al,cl
+	jz	short .sse_1
+.sse_0:	ldmxcsr	DWORD [mxcsr]		; executing SSE instruction
+.sse_1:
+
+	pop	POINTER [fs:0]		; prev_record_ptr
+	mov	esp,ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; LOCAL(EXCEPTION_DISPOSITION)
+; exception_handler (struct _EXCEPTION_RECORD * ExceptionRecord,
+;                    void * EstablisherFrame, struct _CONTEXT * ContextRecord,
+;                    void * DispatcherContext);
+;
+
+%define ExceptionContinueExecution  0	; from <excpt.h>
+%define ExceptionContinueSearch     1	; typedef enum _EXCEPTION_DISPOSITION {
+%define ExceptionNestedException    2	;   ...
+%define ExceptionCollidedUnwind     3	; } EXCEPTION_DISPOSITION
+
+%define EXCEPTION_ILLEGAL_INSTRUCTION   0xC000001D	; from <winbase.h>
+
+%define ExceptionRecord     esp+4	; struct _EXCEPTION_RECORD *
+%define EstablisherFrame    esp+8	; void * EstablisherFrame
+%define ContextRecord       esp+12	; struct _CONTEXT * ContextRecord
+%define DispatcherContext   esp+16	; void * DispatcherContext
+
+%define ExceptionCode(b)    (b)+0	; ExceptionRecord->ExceptionCode
+%define ExceptionFlags(b)   (b)+4	; ExceptionRecord->ExceptionFlags
+%define Context_Edx(b)      (b)+168	; ContextRecord->Edx
+%define Context_Ecx(b)      (b)+172	; ContextRecord->Ecx
+%define Context_Eax(b)      (b)+176	; ContextRecord->Eax
+%define Context_Eip(b)      (b)+184	; ContextRecord->Eip
+
+	align	16
+
+exception_handler:
+	mov	edx, POINTER [ExceptionRecord]
+	mov	eax, ExceptionContinueSearch
+
+	cmp	DWORD [ExceptionFlags(edx)], byte 0
+	jne	short .return			; noncontinuable exception
+	cmp	DWORD [ExceptionCode(edx)], EXCEPTION_ILLEGAL_INSTRUCTION
+	jne	short .return			; not a #UD exception
+
+	mov	eax, POINTER [ContextRecord]
+	mov	ecx, DWORD [Context_Ecx(eax)]
+	mov	edx, DWORD [Context_Edx(eax)]
+	not	ecx
+	add	DWORD [Context_Eip(eax)], edx	; next instruction
+	and	DWORD [Context_Eax(eax)], ecx	; turn off flag
+	mov	eax, ExceptionContinueExecution
+.return:
+	ret
+
--- a/libjpeg.spec
+++ b/libjpeg.spec
@@ -0,0 +1,234 @@
+%define LIBVER 62.1.0
+Summary: A library for manipulating JPEG image format files (with SIMD support)
+Summary(ja): JPEG <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>򰷤<EFBFBD><EFBFBD>٤Υ饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD> (x86 SIMD <EFBFBD>б<EFBFBD><EFBFBD><EFBFBD>)
+Name: libjpeg
+Version: 6bx1.02
+Release: 1
+License: distributable
+Group: System Environment/Libraries
+Source0: http://cetus.sakura.ne.jp/softlab/jpeg-x86simd/sources/jpegsrc-6b-x86simd-1.02.tar.bz2
+Buildroot: %{_tmppath}/%{name}-%{version}-root
+ExclusiveArch: %{ix86}
+BuildPrereq: nasm >= 0.98.25
+
+%package devel
+Summary: Development tools for programs which will use the libjpeg library.
+Summary(ja): libjpeg <EFBFBD>饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȥ<EFBFBD><EFBFBD>ץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȯ<EFBFBD>ġ<EFBFBD><EFBFBD><EFBFBD>
+Group: Development/Libraries
+Requires: libjpeg = %{version}-%{release}
+
+%description
+The libjpeg package contains a library of functions for manipulating
+JPEG images, as well as simple client programs for accessing the
+libjpeg functions.  Libjpeg client programs include cjpeg, djpeg,
+jpegtran, rdjpgcom and wrjpgcom.  Cjpeg compresses an image file into
+JPEG format.  Djpeg decompresses a JPEG file into a regular image
+file.  Jpegtran can perform various useful transformations on JPEG
+files.  Rdjpgcom displays any text comments included in a JPEG file.
+Wrjpgcom inserts text comments into a JPEG file.
+
+The libjpeg library in this package uses SIMD instructions if available.
+On a processor that supports SIMD instructions (MMX, SSE, etc),
+it runs 2-3 times faster than the original version of libjpeg.
+
+%description -l ja
+libjpeg <EFBFBD>ѥå<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ˤ<EFBFBD> JPEG <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>򰷤<EFBFBD><EFBFBD>٤<EFBFBD>ɬ<EFBFBD>פʥ饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȡ<EFBFBD>
+libjpeg <EFBFBD>ؿ<EFBFBD><EFBFBD>˥<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>٤δ<EFBFBD>ñ<EFBFBD>ʥ<EFBFBD><EFBFBD>饤<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȥץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ब
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>libjpeg <EFBFBD><EFBFBD><EFBFBD>饤<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȥץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ˤ<EFBFBD> cjpeg, djpeg,
+jpegtran, rdjpgcom, wrjpgcom <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>cjpeg <EFBFBD>ϲ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+JPEG <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˰<EFBFBD><EFBFBD>̤<EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>djpeg <EFBFBD><EFBFBD> JPEG <EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̾<EFBFBD><EFBFBD>β<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+Ÿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>jpegtran <EFBFBD><EFBFBD> JPEG <EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͡<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ѵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ܤ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>
+rdjpgcom <EFBFBD><EFBFBD> JPEG <EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˴ޤޤ<EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ƥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Υ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȥ<EFBFBD>ɽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+wrjpgcom <EFBFBD><EFBFBD> JPEG <EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˥ƥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Υ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȥ<EFBFBD><EFBFBD>ɲä<EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>
+
+<EFBFBD><EFBFBD><EFBFBD>Υѥå<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˼<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD><EFBFBD> libjpeg <EFBFBD>饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ϡ<EFBFBD>x86 SIMD <EFBFBD>б<EFBFBD><EFBFBD>ǤǤ<EFBFBD><EFBFBD><EFBFBD>
+MMX <EFBFBD><EFBFBD> SSE <EFBFBD>ʤɤ<EFBFBD> SIMD <EFBFBD>黻<EFBFBD><EFBFBD>ǽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>å<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ư<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȡ<EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD>ꥸ<EFBFBD>ʥ<EFBFBD><EFBFBD>Ǥ<EFBFBD> libjpeg <EFBFBD>饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ӥ<EFBFBD><EFBFBD><EFBFBD> 2<EFBFBD><EFBFBD>3<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>٤<EFBFBD>®<EFBFBD>٤<EFBFBD>ư<EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>
+
+%description devel
+The libjpeg-devel package includes the header files and static libraries
+necessary for developing programs which will manipulate JPEG files using
+the libjpeg library.
+
+If you are going to develop programs which will manipulate JPEG images,
+you should install libjpeg-devel.  You'll also need to have the libjpeg
+package installed.
+
+%description devel -l ja
+libjpeg-devel <EFBFBD>ѥå<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ˤϡ<EFBFBD>libjpeg <EFBFBD>饤<EFBFBD>֥<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȥä<EFBFBD> JPEG <EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȯ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Τ<EFBFBD>ɬ<EFBFBD>פʥإå<EFBFBD><EFBFBD>ե<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ƥ<EFBFBD><EFBFBD>å<EFBFBD><EFBFBD>饤<EFBFBD>֥<EFBFBD><EFBFBD>꤬
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>
+
+JPEG <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>򰷤<EFBFBD><EFBFBD>ץ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȯ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݤˤϡ<EFBFBD>libjpeg-devel <EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD>󥹥ȡ<EFBFBD><EFBFBD>뤷<EFBFBD>Ʋ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ʊ<EFBFBD><EFBFBD><EFBFBD><EFBFBD> libjpeg <EFBFBD>ѥå<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>⥤<EFBFBD>󥹥ȡ<EFBFBD><EFBFBD>뤹<EFBFBD><EFBFBD>
+ɬ<EFBFBD>פ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ޤ<EFBFBD><EFBFBD><EFBFBD>
+
+%prep
+%setup -q -n jpeg-6bx
+# suppress "libtoolize --copy --force"
+mv configure.in configure.in_
+
+%build
+%configure --enable-shared --enable-static
+
+make libdir=%{_libdir} %{?_smp_mflags}
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD make test
+
+%install
+rm -rf $RPM_BUILD_ROOT
+
+%makeinstall
+#strip -R .comment $RPM_BUILD_ROOT/usr/bin/* || :
+#/sbin/ldconfig -n $RPM_BUILD_ROOT/%{_libdir}
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%doc usage.doc wizard.doc README
+%{_libdir}/libjpeg.so.*
+%{_bindir}/*
+%{_mandir}/*/*
+
+%files devel
+%defattr(-,root,root)
+%doc libjpeg.doc coderules.doc structure.doc example.c
+%doc simd_*.txt
+%{_libdir}/*.a
+%{_libdir}/*.la
+%{_libdir}/*.so
+/usr/include/*.h
+
+%changelog
+* Sat Feb 04 2006 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.02-1
+- upgraded to 6bx1.02
+
+* Thu Jan 26 2006 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.01-1
+- upgraded to 6bx1.01
+
+* Thu Mar 24 2005 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.0-1
+- based on 6b-33 from Fedora Core 3 and modified for SIMD-extended libjpeg
+- added Japanese summary and description, which is delivered from Vine Linux
+- moved wizard.doc to main package
+
+* Thu Oct  7 2004 Matthias Clasen <mclasen@redhat.com> - 6b-33
+- Add URL.  (#134791)
+
+* Tue Jun 15 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Tue Mar 02 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Fri Feb 13 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Thu Sep 25 2003 Jeremy Katz <katzj@redhat.com> 6b-30
+- rebuild to fix gzipped file md5sums (#91211)
+
+* Tue Sep 23 2003 Florian La Roche <Florian.LaRoche@redhat.de>
+- do not set rpath
+
+* Wed Jun 04 2003 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Thu Feb 13 2003 Elliot Lee <sopwith@redhat.com> 6b-27
+- Add libjpeg-shared.patch to fix shlibs on powerpc
+
+* Tue Feb 04 2003 Florian La Roche <Florian.LaRoche@redhat.de>
+- add symlink to shared lib
+
+* Wed Jan 22 2003 Tim Powers <timp@redhat.com>
+- rebuilt
+
+* Mon Jan  6 2003 Jonathan Blandford <jrb@redhat.com>
+- add docs, #76508
+
+* Fri Dec 13 2002 Elliot Lee <sopwith@redhat.com> 6b-23
+- Merge in multilib changes
+- _smp_mflags
+
+* Tue Sep 10 2002 Than Ngo <than@redhat.com> 6b-22
+- use %%_libdir
+
+* Fri Jun 21 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Thu May 23 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Thu Jan 31 2002 Bernhard Rosenkraenzer <bero@redhat.com> 6b-19
+- Fix bug #59011
+
+* Mon Jan 28 2002 Bernhard Rosenkraenzer <bero@redhat.com> 6b-18
+- Fix bug #58982
+
+* Wed Jan 09 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Tue Jul 24 2001 Bill Nottingham <notting@redhat.com>
+- require libjpeg = %%{version}
+
+* Sun Jun 24 2001 Elliot Lee <sopwith@redhat.com>
+- Bump release + rebuild.
+
+* Mon Dec 11 2000 Than Ngo <than@redhat.com>
+- rebuilt with the fixed fileutils
+- use %%{_tmppath}
+
+* Wed Nov  8 2000 Bernhard Rosenkraenzer <bero@redhat.com>
+- fix a typo (strip -R .comment, not .comments)
+
+* Thu Jul 13 2000 Prospector <bugzilla@redhat.com>
+- automatic rebuild
+
+* Sat Jun 17 2000 Bernhard Rosenkraenzer <bero@redhat.com>
+- FHSify
+- add some C++ tweaks to the headers as suggested by bug #9822)
+
+* Wed May  5 2000 Bill Nottingham <notting@redhat.com>
+- configure tweaks for ia64; remove alpha patch (it's pointless)
+
+* Sat Feb  5 2000 Bernhard Rosenkr<6B><72>zer <bero@redhat.com>
+- rebuild to get compressed man pages
+- fix description
+- some minor tweaks to the spec file
+- add docs
+- fix build on alpha (alphaev6 stuff)
+
+* Sun Mar 21 1999 Cristian Gafton <gafton@redhat.com> 
+- auto rebuild in the new build environment (release 9)
+
+* Wed Jan 13 1999 Cristian Gafton <gafton@redhat.com>
+- patch to build on arm
+- build for glibc 2.1
+
+* Mon Oct 12 1998 Cristian Gafton <gafton@redhat.com>
+- strip binaries
+
+* Mon Aug  3 1998 Jeff Johnson <jbj@redhat.com>
+- fix buildroot problem.
+
+* Tue Jun 09 1998 Prospector System <bugs@redhat.com>
+- translations modified for de
+
+* Thu Jun 04 1998 Marc Ewing <marc@redhat.com>
+- up to release 4
+- remove patch that set (improper) soname - libjpeg now does it itself
+
+* Thu May 07 1998 Prospector System <bugs@redhat.com>
+- translations modified for de, fr, tr
+
+* Fri May 01 1998 Cristian Gafton <gafton@redhat.com>
+- fixed build on manhattan
+
+* Wed Apr 08 1998 Cristian Gafton <gafton@redhat.com>
+- upgraded to version 6b
+
+* Wed Oct 08 1997 Donnie Barnes <djb@redhat.com>
+- new package to remove jpeg stuff from libgr and put in it's own package
--- a/1512
+++ b/1512
--- a/ltmain.sh
+++ b/ltmain.sh
--- a/makecfg.c
+++ b/makecfg.c
@@ -0,0 +1,300 @@
+/*
+ * makecfg.c
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ * Last Modified : March 23, 2005
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#ifndef offsetof		/* defined in <stddef.h> */
+#define offsetof(type, mem) ((size_t) \
+		((char *)&((type *)0)->mem - (char *)(type *)0))
+#endif
+
+void
+print_structure_offset (void)
+{
+  printf("\n");
+  printf("; ---- macros for structure access -----------------------------------------\n");
+  printf("\n");
+
+  printf("; struct jpeg_compress_struct\n\n");
+  printf("%%define jcstruct_image_width(b)         ((b) + %3u) ; cinfo->image_width\n",
+	(unsigned)offsetof(struct jpeg_compress_struct, image_width));
+  printf("%%define jcstruct_max_v_samp_factor(b)   ((b) + %3u) ; cinfo->max_v_samp_factor\n",
+	(unsigned)offsetof(struct jpeg_compress_struct, max_v_samp_factor));
+  printf("\n");
+
+  printf("; struct jpeg_decompress_struct\n\n");
+  printf("%%define jdstruct_output_width(b)        ((b) + %3u) ; cinfo->output_width\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, output_width));
+  printf("%%define jdstruct_max_v_samp_factor(b)   ((b) + %3u) ; cinfo->max_v_samp_factor\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, max_v_samp_factor));
+  printf("%%define jdstruct_sample_range_limit(b)  ((b) + %3u) ; cinfo->sample_range_limit\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, sample_range_limit));
+  printf("\n");
+
+  printf("; jpeg_component_info\n\n");
+  printf("%%define jcompinfo_v_samp_factor(b)      ((b) + %2u) ; compptr->v_samp_factor\n",
+	(unsigned)offsetof(jpeg_component_info, v_samp_factor));
+  printf("%%define jcompinfo_width_in_blocks(b)    ((b) + %2u) ; compptr->width_in_blocks\n",
+	(unsigned)offsetof(jpeg_component_info, width_in_blocks));
+  printf("%%define jcompinfo_downsampled_width(b)  ((b) + %2u) ; compptr->downsampled_width\n",
+	(unsigned)offsetof(jpeg_component_info, downsampled_width));
+  printf("%%define jcompinfo_dct_table(b)          ((b) + %2u) ; compptr->dct_table\n",
+	(unsigned)offsetof(jpeg_component_info, dct_table));
+  printf("\n");
+}
+
+
+void
+print_jconfig_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jconfig.h -----------------------------------------------\n");
+  printf("\n");
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+  printf("%%define NEED_SHORT_EXTERNAL_NAMES\t; Use short forms of external names\n");
+#else
+  printf("%%undef NEED_SHORT_EXTERNAL_NAMES\t; Use short forms of external names\n");
+#endif
+  printf("\n");
+}
+
+
+void
+print_jmorecfg_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jmorecfg.h ----------------------------------------------\n");
+  printf("\n");
+
+  printf("; Capability options common to encoder and decoder:\n");
+  printf("\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  printf("%%define DCT_ISLOW_SUPPORTED\t; slow but accurate integer algorithm\n");
+#else
+  printf("%%undef DCT_ISLOW_SUPPORTED\t; slow but accurate integer algorithm\n");
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  printf("%%define DCT_IFAST_SUPPORTED\t; faster, less accurate integer method\n");
+#else
+  printf("%%undef DCT_IFAST_SUPPORTED\t; faster, less accurate integer method\n");
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  printf("%%define DCT_FLOAT_SUPPORTED\t; floating-point: accurate, fast on fast HW\n");
+#else
+  printf("%%undef DCT_FLOAT_SUPPORTED\t; floating-point: accurate, fast on fast HW\n");
+#endif
+  printf("\n");
+
+  printf("; Decoder capability options:\n");
+  printf("\n");
+#ifdef IDCT_SCALING_SUPPORTED
+  printf("%%define IDCT_SCALING_SUPPORTED\t\t; Output rescaling via IDCT?\n");
+#else
+  printf("%%undef IDCT_SCALING_SUPPORTED\t\t; Output rescaling via IDCT?\n");
+#endif
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  printf("%%define UPSAMPLE_MERGING_SUPPORTED\t; Fast path for sloppy upsampling?\n");
+#else
+  printf("%%undef UPSAMPLE_MERGING_SUPPORTED\t; Fast path for sloppy upsampling?\n");
+#endif
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+  printf("%%define UPSAMPLE_H1V2_SUPPORTED\t\t; Fast/fancy processing for 1h2v?\n");
+#else
+  printf("%%undef UPSAMPLE_H1V2_SUPPORTED\t\t; Fast/fancy processing for 1h2v?\n");
+#endif
+  printf("\n");
+
+#if (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4) && \
+    (RGB_RED < 0 || RGB_RED >= RGB_PIXELSIZE || RGB_GREEN < 0 || \
+     RGB_GREEN >= RGB_PIXELSIZE || RGB_BLUE < 0 || RGB_BLUE >= RGB_PIXELSIZE || \
+     RGB_RED == RGB_GREEN || RGB_GREEN == RGB_BLUE || RGB_RED == RGB_BLUE)
+#error "Incorrect RGB pixel offset."
+#endif
+  printf("; Ordering of RGB data in scanlines passed to or from the application.\n");
+  printf("\n");
+  printf("%%define RGB_RED\t\t%u\t; Offset of Red in an RGB scanline element\n", RGB_RED);
+  printf("%%define RGB_GREEN\t%u\t; Offset of Green\n", RGB_GREEN);
+  printf("%%define RGB_BLUE\t%u\t; Offset of Blue\n", RGB_BLUE);
+  printf("%%define RGB_PIXELSIZE\t%u\t; JSAMPLEs per RGB scanline element\n", RGB_PIXELSIZE);
+  printf("\n");
+#ifdef RGBX_FILLER_0XFF
+  printf("%%define RGBX_FILLER_0XFF\t; fill dummy bytes with 0xFF in RGBX format\n");
+#else
+  printf("%%undef RGBX_FILLER_0XFF\t\t; fill dummy bytes with 0xFF in RGBX format\n");
+#endif
+  printf("\n");
+
+  printf("; SIMD support options (encoder):\n");
+  printf("\n");
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+  printf("%%define JCCOLOR_RGBYCC_MMX_SUPPORTED\t; RGB->YCC conversion with MMX\n");
+#else
+  printf("%%undef JCCOLOR_RGBYCC_MMX_SUPPORTED\t; RGB->YCC conversion with MMX\n");
+#endif
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+  printf("%%define JCCOLOR_RGBYCC_SSE2_SUPPORTED\t; RGB->YCC conversion with SSE2\n");
+#else
+  printf("%%undef JCCOLOR_RGBYCC_SSE2_SUPPORTED\t; RGB->YCC conversion with SSE2\n");
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+  printf("%%define JCSAMPLE_MMX_SUPPORTED\t\t; downsampling with MMX\n");
+#else
+  printf("%%undef JCSAMPLE_MMX_SUPPORTED\t\t; downsampling with MMX\n");
+#endif
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+  printf("%%define JCSAMPLE_SSE2_SUPPORTED\t\t; downsampling with SSE2\n");
+#else
+  printf("%%undef JCSAMPLE_SSE2_SUPPORTED\t\t; downsampling with SSE2\n");
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+  printf("%%define JFDCT_INT_MMX_SUPPORTED\t\t; forward DCT with MMX\n");
+#else
+  printf("%%undef JFDCT_INT_MMX_SUPPORTED\t\t; forward DCT with MMX\n");
+#endif
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+  printf("%%define JFDCT_INT_SSE2_SUPPORTED\t; forward DCT with SSE2\n");
+#else
+  printf("%%undef JFDCT_INT_SSE2_SUPPORTED\t\t; forward DCT with SSE2\n");
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+  printf("%%define JFDCT_FLT_3DNOW_MMX_SUPPORTED\t; forward DCT with 3DNow!/MMX\n");
+#else
+  printf("%%undef JFDCT_FLT_3DNOW_MMX_SUPPORTED\t; forward DCT with 3DNow!/MMX\n");
+#endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+  printf("%%define JFDCT_FLT_SSE_MMX_SUPPORTED\t; forward DCT with SSE/MMX\n");
+#else
+  printf("%%undef JFDCT_FLT_SSE_MMX_SUPPORTED\t; forward DCT with SSE/MMX\n");
+#endif
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+  printf("%%define JFDCT_FLT_SSE_SSE2_SUPPORTED\t; forward DCT with SSE/SSE2\n");
+#else
+  printf("%%undef JFDCT_FLT_SSE_SSE2_SUPPORTED\t; forward DCT with SSE/SSE2\n");
+#endif
+#ifdef JFDCT_INT_QUANTIZE_WITH_DIVISION
+  printf("%%define JFDCT_INT_QUANTIZE_WITH_DIVISION ; Use general quantization method\n");
+#else
+  printf("%%undef JFDCT_INT_QUANTIZE_WITH_DIVISION ; Use general quantization method\n");
+#endif
+  printf("\n");
+
+  printf("; SIMD support options (decoder):\n");
+  printf("\n");
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+  printf("%%define JDCOLOR_YCCRGB_MMX_SUPPORTED\t; YCC->RGB conversion with MMX\n");
+#else
+  printf("%%undef JDCOLOR_YCCRGB_MMX_SUPPORTED\t; YCC->RGB conversion with MMX\n");
+#endif
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+  printf("%%define JDCOLOR_YCCRGB_SSE2_SUPPORTED\t; YCC->RGB conversion with SSE2\n");
+#else
+  printf("%%undef JDCOLOR_YCCRGB_SSE2_SUPPORTED\t; YCC->RGB conversion with SSE2\n");
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+  printf("%%define JDMERGE_MMX_SUPPORTED\t\t; merged upsampling with MMX\n");
+#else
+  printf("%%undef JDMERGE_MMX_SUPPORTED\t\t; merged upsampling with MMX\n");
+#endif
+#ifdef JDMERGE_SSE2_SUPPORTED
+  printf("%%define JDMERGE_SSE2_SUPPORTED\t\t; merged upsampling with SSE2\n");
+#else
+  printf("%%undef JDMERGE_SSE2_SUPPORTED\t\t; merged upsampling with SSE2\n");
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+  printf("%%define JDSAMPLE_FANCY_MMX_SUPPORTED\t; fancy upsampling with MMX\n");
+#else
+  printf("%%undef JDSAMPLE_FANCY_MMX_SUPPORTED\t; fancy upsampling with MMX\n");
+#endif
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+  printf("%%define JDSAMPLE_FANCY_SSE2_SUPPORTED\t; fancy upsampling with SSE2\n");
+#else
+  printf("%%undef JDSAMPLE_FANCY_SSE2_SUPPORTED\t; fancy upsampling with SSE2\n");
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+  printf("%%define JDSAMPLE_SIMPLE_MMX_SUPPORTED\t; sloppy upsampling with MMX\n");
+#else
+  printf("%%undef JDSAMPLE_SIMPLE_MMX_SUPPORTED\t; sloppy upsampling with MMX\n");
+#endif
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+  printf("%%define JDSAMPLE_SIMPLE_SSE2_SUPPORTED\t; sloppy upsampling with SSE2\n");
+#else
+  printf("%%undef JDSAMPLE_SIMPLE_SSE2_SUPPORTED\t; sloppy upsampling with SSE2\n");
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+  printf("%%define JIDCT_INT_MMX_SUPPORTED\t\t; inverse DCT with MMX\n");
+#else
+  printf("%%undef JIDCT_INT_MMX_SUPPORTED\t\t; inverse DCT with MMX\n");
+#endif
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+  printf("%%define JIDCT_INT_SSE2_SUPPORTED\t; inverse DCT with SSE2\n");
+#else
+  printf("%%undef JIDCT_INT_SSE2_SUPPORTED\t\t; inverse DCT with SSE2\n");
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+  printf("%%define JIDCT_FLT_3DNOW_MMX_SUPPORTED\t; inverse DCT with 3DNow!/MMX\n");
+#else
+  printf("%%undef JIDCT_FLT_3DNOW_MMX_SUPPORTED\t; inverse DCT with 3DNow!/MMX\n");
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+  printf("%%define JIDCT_FLT_SSE_MMX_SUPPORTED\t; inverse DCT with SSE/MMX\n");
+#else
+  printf("%%undef JIDCT_FLT_SSE_MMX_SUPPORTED\t; inverse DCT with SSE/MMX\n");
+#endif
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+  printf("%%define JIDCT_FLT_SSE_SSE2_SUPPORTED\t; inverse DCT with SSE/SSE2\n");
+#else
+  printf("%%undef JIDCT_FLT_SSE_SSE2_SUPPORTED\t; inverse DCT with SSE/SSE2\n");
+#endif
+  printf("\n");
+}
+
+
+void
+print_jpeglib_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jpeglib.h ----------------------------------------------\n");
+  printf("\n");
+
+  printf("; Version ID for the JPEG library.\n");
+  printf("; Might be useful for tests like \"#if JPEG_LIB_VERSION >= 60\".\n");
+  printf("\n");
+  printf("%%define JPEG_LIB_VERSION  %d\n", JPEG_LIB_VERSION);
+  printf("\n");
+  printf("; SIMD Ext: Version ID for the SIMD extension.\n");
+  printf("\n");
+  printf("%%define JPEG_SIMDEXT_VERSION  %d\n", JPEG_SIMDEXT_VERSION);
+  printf("%%define JPEG_SIMDEXT_VER_STR  \"%s\"\n", JPEG_SIMDEXT_VER_STR);
+  printf("\n");
+}
+
+
+int
+main (void)
+{
+  printf(";\n; jsimdcfg.inc --- generated by makecfg.c");
+#ifdef __DATE__
+#ifdef __TIME__
+  printf(" (%s, %s)", __DATE__, __TIME__);
+#endif
+#endif
+  printf("\n;\n\n");
+  printf("%%define JSIMDCFG_INCLUDED\t; so that jsimdcfg.inc doesn't do it again\n\n");
+
+  print_structure_offset();
+  print_jconfig_h_macro();
+  print_jmorecfg_h_macro();
+  print_jpeglib_h_macro();
+
+  exit(0);
+  return 0;			/* suppress no-return-value warnings */
+}
--- a/makefile.ansi
+++ b/makefile.ansi
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension

 # This makefile is suitable for Unix-like systems with ANSI-capable compilers.
 # If you have a non-ANSI compiler, makefile.unix is a better starting point.
@@ -13,6 +14,13 @@ CFLAGS= -O
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.

+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -felf -DELF
+
 # Link-time cc options:
 LDFLAGS= 

@@ -24,6 +32,10 @@ LDLIBS=
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= jmemnobs.o

+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -75,17 +87,23 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -125,7 +143,7 @@ jconfig.h: jconfig.doc

 clean:
 	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) core testout*
+	$(RM) jsimdcfg.inc core testout*

 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -143,10 +161,63 @@ test: cjpeg djpeg jpegtran
 	cmp testorig.jpg testoutt.jpg


+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg ./makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) ./makecfg
+
+.asm.o:
+	$(NASM) $(NAFLAGS) -o $@ $*.asm
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -157,33 +228,33 @@ jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
--- a/makefile.bc5
+++ b/makefile.bc5
@@ -0,0 +1,320 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is suitable for Borland C++ Compiler 5.5 (win32)
+
+# Read installation instructions before saying "make" !!
+
+!ifndef srcdir
+srcdir = .
+!endif
+.path.c   = $(srcdir)
+.path.h   = $(srcdir)
+.path.asm = $(srcdir)
+.path.inc = $(srcdir);.
+.path.doc = $(srcdir)
+
+# The name of your C compiler:
+CC= bcc32
+
+# You may need to adjust these cc options:
+CFLAGS= -O2 -OS -Oc -d -ff -w-par -w-aus -w-ccc -w-rch -q -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fobj -DOBJ32
+
+# Link-time cc options:
+LDFLAGS= -tWC -q
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= noeh32.lib
+
+# Put here the object file name for the correct system-dependent memory
+# manager file. For Win32, we recommend jmemnobs.c (flat memory!)
+# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
+SYSDEPMEM= jmemnobs.obj
+SYSDEPMEMLIB= +jmemnobs.obj
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+SYSDEPSIMDCHKLIB= +jsimdw32.obj
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
+        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
+        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
+# decompression library object files
+DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
+        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
+# These objectfiles are included in libjpeg.lib
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
+        rdswitch.obj cdjpeg.obj
+DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
+        rdcolmap.obj cdjpeg.obj
+TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
+
+
+all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+libjpeg.lib: $(LIBOBJECTS)
+	- del libjpeg.lib
+	tlib libjpeg.lib /E /C @&&|
+jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj +jdatadst.obj &
+jcinit.obj +jcmaster.obj +jcmarker.obj +jcmainct.obj +jcprepct.obj &
+jccoefct.obj +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj &
+jcdctmgr.obj +jccolmmx.obj +jccolss2.obj +jcsammmx.obj +jcsamss2.obj &
+jcqntint.obj +jcqntflt.obj +jcqntmmx.obj +jcqnt3dn.obj +jcqnts2i.obj &
+jcqntsse.obj +jcqnts2f.obj +jfdctint.obj +jfdctfst.obj +jfdctflt.obj &
+jfmmxint.obj +jfmmxfst.obj +jf3dnflt.obj +jfss2int.obj +jfss2fst.obj &
+jfsseflt.obj +jdapimin.obj +jdapistd.obj +jdtrans.obj +jdatasrc.obj &
+jdmaster.obj +jdinput.obj +jdmarker.obj +jdhuff.obj +jdphuff.obj &
+jdmainct.obj +jdcoefct.obj +jdpostct.obj +jddctmgr.obj +jdsample.obj &
+jdcolor.obj +jquant1.obj +jquant2.obj +jdmerge.obj +jidctint.obj &
+jidctfst.obj +jidctred.obj +jidctflt.obj +jimmxint.obj +jimmxfst.obj &
+jimmxred.obj +ji3dnflt.obj +jiss2int.obj +jiss2fst.obj +jiss2red.obj &
+jisseflt.obj +jiss2flt.obj +jdsammmx.obj +jdsamss2.obj +jdcolmmx.obj &
+jdcolss2.obj +jdmermmx.obj +jdmerss2.obj +jcomapi.obj +jutils.obj &
+jerror.obj +jmemmgr.obj $(SYSDEPMEMLIB) +jsimdcpu.obj $(SYSDEPSIMDCHKLIB)
+|
+
+cjpeg.exe: $(COBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -ecjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -edjpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -ejpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.obj
+	$(CC) $(LDFLAGS) -erdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.obj
+	$(CC) $(LDFLAGS) -ewrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+
+# This "{}" syntax allows Borland Make to "batch" source files.
+# In this way, each run of the compiler can build many modules.
+.c.obj:
+	$(CC) $(CFLAGS) -c{ $<}
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	- del *.obj
+	- del *.tds
+	- del cjpeg.exe
+	- del djpeg.exe
+	- del jpegtran.exe
+	- del rdjpgcom.exe
+	- del wrjpgcom.exe
+	- del jsimdcfg.inc
+	- del libjpeg.lib
+	- del testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	- del testout*.*
+	djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(srcdir)\makecfg.c
+	$(CC) $(LDFLAGS) -emakecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	- del makecfg.tds
+	- del makecfg.obj
+	- del makecfg.exe
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/makefile.cfg
+++ b/makefile.cfg
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension

 # makefile.cfg is edited by configure to produce a custom Makefile.

@@ -16,8 +17,9 @@ libdir = $(exec_prefix)/lib
 includedir = $(prefix)/include
 binprefix =
 manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
+manext = .1
+mandir = $(prefix)/man
+man1dir = $(mandir)/man1

 # The name of your C compiler:
 CC= @CC@
@@ -29,6 +31,10 @@ CFLAGS= @CFLAGS@ @CPPFLAGS@ @INCLUDEFLAGS@
 # However, any special defines for ansi2knr.c may be included here:
 ANSI2KNRFLAGS= @ANSI2KNRFLAGS@

+# The executable name of NASM and its options:
+NASM= @NASM@
+NAFLAGS= @NAFLAGS@ @INCLUDEFLAGS@
+
 # Link-time cc options:
 LDFLAGS= @LDFLAGS@

@@ -37,6 +43,7 @@ LDLIBS= @LIBS@

 # If using GNU libtool, LIBTOOL references it; if not, LIBTOOL is empty.
 LIBTOOL = @LIBTOOL@
+top_builddir = .
 # $(O) expands to "lo" if using libtool, plain "o" if not.
 # Similarly, $(A) expands to "la" or "a".
 O = @O@
@@ -51,8 +58,12 @@ JPEG_LIB_VERSION = @JPEG_LIB_VERSION@
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= @MEMORYMGR@

+# OS-dependent SIMD instruction support checker
+# jsimdw32.$(O) (Win32) / jsimddjg.$(O) (DJGPP V.2) / jsimdgcc.$(O) (Unix/gcc)
+SYSDEPSIMDCHK= @SIMDCHECKER@
+
 # miscellaneous OS-dependent stuff
-SHELL= /bin/sh
+SHELL= @SHELL@
 # linker
 LN= @LN@
 # file deletion command
@@ -68,6 +79,11 @@ INSTALL= @INSTALL@
 INSTALL_PROGRAM= @INSTALL_PROGRAM@
 INSTALL_LIB= @INSTALL_LIB@
 INSTALL_DATA= @INSTALL_DATA@
+# uninstallation program
+UNINSTALL= @UNINSTALL@
+# executable suffix. under cygwin,
+# 'rm' doesn't know that executables have .exe suffix.
+EXE = @EXEEXT@

 # End of configurable options.

@@ -110,19 +126,26 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM)
+COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM) \
+        jsimdcpu.$(O) $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) jctrans.$(O) jcparam.$(O) \
        jdatadst.$(O) jcinit.$(O) jcmaster.$(O) jcmarker.$(O) jcmainct.$(O) \
        jcprepct.$(O) jccoefct.$(O) jccolor.$(O) jcsample.$(O) jchuff.$(O) \
-        jcphuff.$(O) jcdctmgr.$(O) jfdctfst.$(O) jfdctflt.$(O) \
-        jfdctint.$(O)
+        jcphuff.$(O) jcdctmgr.$(O) jccolmmx.$(O) jccolss2.$(O) jcsammmx.$(O) \
+        jcsamss2.$(O) jcqntint.$(O) jcqntflt.$(O) jcqntmmx.$(O) jcqnt3dn.$(O) \
+        jcqnts2i.$(O) jcqntsse.$(O) jcqnts2f.$(O) jfdctint.$(O) jfdctfst.$(O) \
+        jfdctflt.$(O) jfmmxint.$(O) jfmmxfst.$(O) jf3dnflt.$(O) jfss2int.$(O) \
+        jfss2fst.$(O) jfsseflt.$(O)
 # decompression library object files
 DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \
        jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \
-        jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \
-        jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
-        jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O)
+        jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) jdsample.$(O) \
+        jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O) jidctint.$(O) \
+        jidctfst.$(O) jidctred.$(O) jidctflt.$(O) jimmxint.$(O) jimmxfst.$(O) \
+        jimmxred.$(O) ji3dnflt.$(O) jiss2int.$(O) jiss2fst.$(O) jiss2red.$(O) \
+        jisseflt.$(O) jiss2flt.$(O) jdsammmx.$(O) jdsamss2.$(O) jdcolmmx.$(O) \
+        jdcolss2.$(O) jdmermmx.$(O) jdmerss2.$(O)
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -136,12 +159,19 @@ TROBJECTS= jpegtran.$(O) rdswitch.$(O) cdjpeg.$(O) transupp.$(O)
 all: @A2K_DEPS@ libjpeg.$(A) cjpeg djpeg jpegtran rdjpgcom wrjpgcom

 # Special compilation rules to support ansi2knr and libtool.
-.SUFFIXES: .lo .la
+.SUFFIXES: .lo .la .asm
+
+.asm.o:
+	$(SHELL) $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) $(srcdir)/$*.asm

 # How to compile with libtool.
@COM_LT@.c.lo:
@COM_LT@	$(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c $(srcdir)/$*.c

+@COM_LT@.asm.lo:
+@COM_LT@	$(LIBTOOL) --mode=compile @TAGCC@ $(SHELL) $(srcdir)/nasm_lt.sh \
+@COM_LT@		$(NASM) $(NAFLAGS) $(srcdir)/$*.asm
+
 # How to use ansi2knr, when not using libtool.
@COM_A2K@.c.o:
@COM_A2K@	./ansi2knr $(srcdir)/$*.c knr/$*.c
@@ -169,7 +199,7 @@ libjpeg.a: @A2K_DEPS@ $(LIBOBJECTS)
 # with libtool:
 libjpeg.la: @A2K_DEPS@ $(LIBOBJECTS)
 	$(LIBTOOL) --mode=link $(CC) -o libjpeg.la $(LIBOBJECTS) \
-		-rpath $(libdir) -version-info $(JPEG_LIB_VERSION)
+		-no-undefined -rpath $(libdir) -version-info $(JPEG_LIB_VERSION)

 # sample programs:

@@ -191,34 +221,62 @@ wrjpgcom: wrjpgcom.$(O)
 # Installation rules:

 install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom @FORCE_INSTALL_LIB@
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	-@if [ ! -d $(man1dir) ]; then mkdir -p $(man1dir); fi
 	$(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
 	$(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
 	$(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
 	$(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
 	$(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
-	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
-	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
-	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
+	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(man1dir)/$(manprefix)cjpeg$(manext)
+	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(man1dir)/$(manprefix)djpeg$(manext)
+	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(man1dir)/$(manprefix)jpegtran$(manext)
+	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(man1dir)/$(manprefix)rdjpgcom$(manext)
+	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(man1dir)/$(manprefix)wrjpgcom$(manext)

 install-lib: libjpeg.$(A) install-headers
+	-@if [ ! -d $(libdir) ]; then mkdir -p $(libdir); fi
 	$(INSTALL_LIB) libjpeg.$(A) $(libdir)/$(binprefix)libjpeg.$(A)

 install-headers: jconfig.h
+	-@if [ ! -d $(includedir) ]; then mkdir -p $(includedir); fi
 	$(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
 	$(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
 	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
 	$(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h

+# Uninstallation rules:
+
+uninstall: @UNINSTALL_LIB@
+	$(UNINSTALL) $(bindir)/$(binprefix)cjpeg$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)djpeg$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)jpegtran$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)rdjpgcom$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)wrjpgcom$(EXE)
+	$(UNINSTALL) $(man1dir)/$(manprefix)cjpeg$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)djpeg$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)jpegtran$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)rdjpgcom$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)wrjpgcom$(manext)
+
+uninstall-lib: uninstall-headers
+	$(UNINSTALL) $(libdir)/$(binprefix)libjpeg.$(A)
+
+uninstall-headers:
+	$(UNINSTALL) $(includedir)/jconfig.h
+	$(UNINSTALL) $(includedir)/jpeglib.h
+	$(UNINSTALL) $(includedir)/jmorecfg.h
+	$(UNINSTALL) $(includedir)/jerror.h
+
 clean:
-	$(RM) *.o *.lo libjpeg.a libjpeg.la
-	$(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout* config.log config.status
+	$(RM) jsimdcfg.inc *.o *.lo libjpeg.a libjpeg.la
+#	 under cygwin, libtool will create wrapper scripts without suffix.
+	$(RM) cjpeg djpeg jpegtran cjpeg$(EXE) djpeg$(EXE) jpegtran$(EXE)
+	$(RM) rdjpgcom$(EXE) wrjpgcom$(EXE) ansi2knr$(EXE) core testout*
 	$(RM) -r knr .libs _libs

 distclean: clean
-	$(RM) Makefile jconfig.h libtool config.cache
+	$(RM) Makefile jconfig.h libtool config.cache config.status config.log

 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -248,10 +306,60 @@ jconfig.h: jconfig.doc
 .PHONY: all install install-lib install-headers clean distclean test check


+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg $(srcdir)/makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) makecfg$(EXE)
+
+jsimdcpu.$(O): jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.$(O): jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.$(O): jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.$(O): jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.$(O): jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.$(O): jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.$(O): jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.$(O): jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.$(O): jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.$(O): jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.$(O): jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.$(O): jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.$(O): jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.$(O): jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.$(O): jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.$(O): jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.$(O): jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.$(O): jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.$(O): jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.$(O): jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.$(O): jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.$(O): jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.$(O): jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.$(O): jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.$(O): jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.$(O): jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.$(O): jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.$(O): jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.$(O): jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.$(O): jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.$(O): jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.$(O): jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.$(O): jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.$(O): jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.$(O): jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.$(O): jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.$(O): ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.$(O): jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.$(O): jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.$(O): jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.$(O): jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.$(O): jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.$(O): jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.$(O): jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.$(O): jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.$(O): jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.$(O): jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.$(O): jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.$(O): jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -262,33 +370,33 @@ jcomapi.$(O): jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerr
 jcparam.$(O): jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.$(O): jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.$(O): jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.$(O): jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.$(O): jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.$(O): jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.$(O): jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.$(O): jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.$(O): jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.$(O): jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.$(O): jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.$(O): jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.$(O): jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.$(O): jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.$(O): jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.$(O): jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.$(O): jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.$(O): jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.$(O): jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.$(O): jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.$(O): jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
--- a/makefile.dj
+++ b/makefile.dj
@@ -1,18 +1,34 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension

 # This makefile is for DJGPP (Delorie's GNU C port on MS-DOS), v2.0 or later.
 # Thanks to Frank J. Donahoe for this version.

 # Read installation instructions before saying "make" !!

+srcdir = .
+VPATH  = $(srcdir)
+
 # The name of your C compiler:
 CC= gcc

 # You may need to adjust these cc options:
-CFLAGS= -O2 -Wall -I.
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.

+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fcoff -DDJGPP
+
 # Link-time cc options:
 LDFLAGS= -s

@@ -24,6 +40,10 @@ LDLIBS=
 # use jmemname.o if you want to use named temp files instead of swap space.
 SYSDEPMEM= jmemnobs.o

+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimddjg.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -75,17 +95,23 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -130,29 +156,83 @@ clean:
 	$(RM) jpegtran.exe
 	$(RM) rdjpgcom.exe
 	$(RM) wrjpgcom.exe
+	$(RM) jsimdcfg.inc
 	$(RM) libjpeg.a
 	$(RM) testout*.*

 test: cjpeg.exe djpeg.exe jpegtran.exe
 	$(RM) testout*.*
-	./djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	./jpegtran -outfile testoutt.jpg testprog.jpg
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg


+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.exe
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -163,33 +243,33 @@ jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
--- a/makefile.linux
+++ b/makefile.linux
@@ -0,0 +1,449 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for Linux ELF with gcc
+
+# Read installation instructions before saying "make" !!
+
+# For compiling with source and object files in different directories.
+srcdir = .
+VPATH  = $(srcdir)
+
+# Where to install the programs and man pages.
+prefix = /usr/local
+exec_prefix = ${prefix}
+bindir = $(exec_prefix)/bin
+libdir = $(exec_prefix)/lib
+includedir = $(prefix)/include
+binprefix =
+manprefix =
+manext = 1
+mandir = $(prefix)/man/man$(manext)
+
+LNNAME	= libjpeg.so
+SONAME	= libjpeg.so.62
+LIBNAME	= libjpeg.so.62.1.0
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options:
+CFLAGS= -O2 -mcpu=i686 -march=i386 -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -felf -DELF
+
+# Link-time cc options:
+LDFLAGS= 
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.  For Unix this is usually jmemnobs.o, but you may want
+# to use jmemansi.o or jmemname.o if you have limited swap space.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= rm -f
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+# installation program
+INSTALL= install -c
+INSTALL_PROGRAM= ${INSTALL} -s
+INSTALL_SHARED = ${INSTALL}
+INSTALL_LIB=  ${INSTALL} -m 644
+INSTALL_DATA= ${INSTALL} -m 644
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These objectfiles are included in libjpeg.a
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# These objectfiles are included in libjpeg.so
+DLLOBJECTS= $(LIBOBJECTS:.o=.pic.o)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+
+all: static shared app
+app: cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+app-static: cjpeg-static djpeg-static jpegtran-static
+shared: $(LIBNAME)
+static: libjpeg.a
+
+libjpeg.a: $(LIBOBJECTS)
+	$(RM) libjpeg.a
+	$(AR) libjpeg.a  $(LIBOBJECTS)
+	$(AR2) libjpeg.a
+
+$(LIBNAME): $(DLLOBJECTS)
+	$(CC) -shared -Wl,-soname,$(SONAME) -o $(LIBNAME) $(DLLOBJECTS)
+
+$(SONAME): $(LIBNAME)
+	ln -sf $(LIBNAME) $(SONAME)
+
+$(LNNAME): $(SONAME)
+	ln -sf $(LIBNAME) $(LNNAME)
+
+cjpeg-static: $(COBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o cjpeg-static $(COBJECTS) libjpeg.a $(LDLIBS)
+
+djpeg-static: $(DOBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o djpeg-static $(DOBJECTS) libjpeg.a $(LDLIBS)
+
+jpegtran-static: $(TROBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o jpegtran-static $(TROBJECTS) libjpeg.a $(LDLIBS)
+
+cjpeg-shared: $(COBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o cjpeg-shared $(COBJECTS) $(LNNAME) $(LDLIBS)
+
+djpeg-shared: $(DOBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o djpeg-shared $(DOBJECTS) $(LNNAME) $(LDLIBS)
+
+jpegtran-shared: $(TROBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o jpegtran-shared $(TROBJECTS) $(LNNAME) $(LDLIBS)
+
+rdjpgcom: rdjpgcom.o
+	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
+
+wrjpgcom: wrjpgcom.o
+	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
+
+cjpeg: cjpeg-shared
+	echo '#!/bin/sh'                                       > cjpeg
+	echo export LD_LIBRARY_PATH=`pwd`:'$$LD_LIBRARY_PATH' >> cjpeg
+	echo exec `pwd`/cjpeg-shared '"$$@"'                  >> cjpeg
+	chmod +x cjpeg
+
+djpeg: djpeg-shared
+	echo '#!/bin/sh'                                       > djpeg
+	echo export LD_LIBRARY_PATH=`pwd`:'$$LD_LIBRARY_PATH' >> djpeg
+	echo exec `pwd`/djpeg-shared '"$$@"'                  >> djpeg
+	chmod +x djpeg
+
+jpegtran: jpegtran-shared
+	echo '#!/bin/sh'                                       > jpegtran
+	echo export LD_LIBRARY_PATH=`pwd`:'$$LD_LIBRARY_PATH' >> jpegtran
+	echo exec `pwd`/jpegtran-shared '"$$@"'               >> jpegtran
+	chmod +x jpegtran
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	$(RM) *.o libjpeg.a $(LIBNAME) $(SONAME) $(LNNAME)
+	$(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+	$(RM) cjpeg-shared djpeg-shared jpegtran-shared
+	$(RM) cjpeg-static djpeg-static jpegtran-static
+	$(RM) core testout*
+	$(RM) jsimdcfg.inc
+
+test: cjpeg djpeg jpegtran
+	$(RM) testout*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)/testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)/testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)/testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
+	cmp $(srcdir)/testimg.ppm testout.ppm
+	cmp $(srcdir)/testimg.bmp testout.bmp
+	cmp $(srcdir)/testimg.jpg testout.jpg
+	cmp $(srcdir)/testimg.ppm testoutp.ppm
+	cmp $(srcdir)/testimgp.jpg testoutp.jpg
+	cmp $(srcdir)/testorig.jpg testoutt.jpg
+
+test-static: cjpeg-static djpeg-static jpegtran-static
+	$(RM) testout*
+	./djpeg-static -dct int -ppm -outfile testout.ppm $(srcdir)/testorig.jpg
+	./djpeg-static -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)/testorig.jpg
+	./cjpeg-static -dct int -outfile testout.jpg $(srcdir)/testimg.ppm
+	./djpeg-static -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
+	./cjpeg-static -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
+	./jpegtran-static -outfile testoutt.jpg $(srcdir)/testprog.jpg
+	cmp $(srcdir)/testimg.ppm testout.ppm
+	cmp $(srcdir)/testimg.bmp testout.bmp
+	cmp $(srcdir)/testimg.jpg testout.jpg
+	cmp $(srcdir)/testimg.ppm testoutp.ppm
+	cmp $(srcdir)/testimgp.jpg testoutp.jpg
+	cmp $(srcdir)/testorig.jpg testoutt.jpg
+
+
+install: install-lib install-app install-man
+
+install-app-static: cjpeg-static djpeg-static jpegtran-static
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	$(INSTALL_PROGRAM) cjpeg-static    $(bindir)/$(binprefix)cjpeg-static
+	$(INSTALL_PROGRAM) djpeg-static    $(bindir)/$(binprefix)djpeg-static
+	$(INSTALL_PROGRAM) jpegtran-static $(bindir)/$(binprefix)jpegtran-static
+
+install-app: install-lib cjpeg-shared djpeg-shared jpegtran-shared rdjpgcom wrjpgcom
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	$(INSTALL_PROGRAM) cjpeg-shared    $(bindir)/$(binprefix)cjpeg
+	$(INSTALL_PROGRAM) djpeg-shared    $(bindir)/$(binprefix)djpeg
+	$(INSTALL_PROGRAM) jpegtran-shared $(bindir)/$(binprefix)jpegtran
+	$(INSTALL_PROGRAM) rdjpgcom        $(bindir)/$(binprefix)rdjpgcom
+	$(INSTALL_PROGRAM) wrjpgcom        $(bindir)/$(binprefix)wrjpgcom
+
+install-man: cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
+	-@if [ ! -d $(mandir) ]; then mkdir -p $(mandir); fi
+	$(INSTALL_DATA) $(srcdir)/cjpeg.1    $(mandir)/$(manprefix)cjpeg.$(manext)
+	$(INSTALL_DATA) $(srcdir)/djpeg.1    $(mandir)/$(manprefix)djpeg.$(manext)
+	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
+	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
+	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
+
+install-lib: install-headers libjpeg.a $(LIBNAME)
+	-@if [ ! -d $(libdir) ]; then mkdir -p $(libdir); fi
+	$(INSTALL_LIB)    libjpeg.a  $(libdir)/libjpeg.a
+	$(INSTALL_SHARED) $(LIBNAME) $(libdir)/$(LIBNAME)
+	(cd $(libdir); ln -sf $(LIBNAME) $(SONAME); ln -sf $(LIBNAME) $(LNNAME))
+
+install-headers: jconfig.h jpeglib.h jmorecfg.h jerror.h
+	-@if [ ! -d $(includedir) ]; then mkdir -p $(includedir); fi
+	$(INSTALL_DATA) $(srcdir)/jconfig.h  $(includedir)/jconfig.h
+	$(INSTALL_DATA) $(srcdir)/jpeglib.h  $(includedir)/jpeglib.h
+	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
+	$(INSTALL_DATA) $(srcdir)/jerror.h   $(includedir)/jerror.h
+
+uninstall: uninstall-lib uninstall-app uninstall-man
+
+uninstall-app-static:
+	$(RM) $(bindir)/$(binprefix)cjpeg-static
+	$(RM) $(bindir)/$(binprefix)djpeg-static
+	$(RM) $(bindir)/$(binprefix)jpegtran-static
+
+uninstall-app: uninstall-lib
+	$(RM) $(bindir)/$(binprefix)cjpeg
+	$(RM) $(bindir)/$(binprefix)djpeg
+	$(RM) $(bindir)/$(binprefix)jpegtran
+	$(RM) $(bindir)/$(binprefix)rdjpgcom
+	$(RM) $(bindir)/$(binprefix)wrjpgcom
+
+uninstall-man:
+	$(RM) $(mandir)/$(manprefix)cjpeg.$(manext)
+	$(RM) $(mandir)/$(manprefix)djpeg.$(manext)
+	$(RM) $(mandir)/$(manprefix)jpegtran.$(manext)
+	$(RM) $(mandir)/$(manprefix)rdjpgcom.$(manext)
+	$(RM) $(mandir)/$(manprefix)wrjpgcom.$(manext)
+
+uninstall-lib: uninstall-headers
+	$(RM) $(libdir)/libjpeg.a
+	$(RM) $(libdir)/$(LIBNAME)
+	$(RM) $(libdir)/$(SONAME)
+	$(RM) $(libdir)/$(LNNAME)
+
+uninstall-headers:
+	$(RM) $(includedir)/jconfig.h
+	$(RM) $(includedir)/jpeglib.h
+	$(RM) $(includedir)/jmorecfg.h
+	$(RM) $(includedir)/jerror.h
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg $(srcdir)/makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) ./makecfg
+
+.SUFFIXES: .c .asm .o .pic.o
+
+%.pic.o : %.c
+	$(CC) $(CFLAGS) -fPIC -c -o $@ $<
+
+%.pic.o : %.asm
+	$(NASM) $(NAFLAGS) -DPIC -o $@ $<
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o jsimdcpu.pic.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o jsimdw32.pic.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o jsimddjg.pic.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o jccolmmx.pic.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o jccolss2.pic.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o jcsammmx.pic.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o jcsamss2.pic.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o jdcolmmx.pic.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o jdcolss2.pic.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o jdmermmx.pic.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o jdmerss2.pic.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o jdsammmx.pic.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o jdsamss2.pic.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o jcqntint.pic.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o jcqntflt.pic.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o jcqntmmx.pic.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o jcqnt3dn.pic.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o jcqnts2i.pic.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o jcqntsse.pic.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o jcqnts2f.pic.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o jfdctint.pic.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o jfdctfst.pic.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o jfdctflt.pic.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o jfmmxint.pic.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o jfmmxfst.pic.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o jf3dnflt.pic.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o jfss2int.pic.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o jfss2fst.pic.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o jfsseflt.pic.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o jidctint.pic.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o jidctfst.pic.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o jidctred.pic.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o jidctflt.pic.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o jimmxint.pic.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o jimmxfst.pic.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o jimmxred.pic.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o ji3dnflt.pic.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o jiss2int.pic.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o jiss2fst.pic.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o jiss2red.pic.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o jisseflt.pic.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o jiss2flt.pic.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o jsimdgcc.pic.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o jcapimin.pic.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o jcapistd.pic.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o jccoefct.pic.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o jccolor.pic.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o jcdctmgr.pic.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o jchuff.pic.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o jcinit.pic.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o jcmainct.pic.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o jcmarker.pic.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o jcmaster.pic.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o jcomapi.pic.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o jcparam.pic.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o jcphuff.pic.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o jcprepct.pic.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o jcsample.pic.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o jctrans.pic.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o jdapimin.pic.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o jdapistd.pic.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o jdatadst.pic.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o jdatasrc.pic.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o jdcoefct.pic.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o jdcolor.pic.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o jddctmgr.pic.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o jdhuff.pic.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o jdinput.pic.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o jdmainct.pic.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o jdmarker.pic.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o jdmaster.pic.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o jdmerge.pic.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o jdphuff.pic.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o jdpostct.pic.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o jdsample.pic.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o jdtrans.pic.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o jerror.pic.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o jfdctflt.pic.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o jfdctfst.pic.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o jfdctint.pic.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o jidctflt.pic.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o jidctfst.pic.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o jidctint.pic.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o jidctred.pic.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o jquant1.pic.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o jquant2.pic.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o jutils.pic.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o jmemmgr.pic.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o jmemansi.pic.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o jmemname.pic.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o jmemnobs.pic.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o jmemdos.pic.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o jmemmac.pic.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/makefile.mgw
+++ b/makefile.mgw
@@ -0,0 +1,298 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for MinGW.
+
+# Read installation instructions before saying "make" !!
+
+srcdir = .
+VPATH  = $(srcdir)
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options:
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time cc options:
+LDFLAGS= -s
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= del
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These objectfiles are included in libjpeg.a
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+
+all: libjpeg.a cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+libjpeg.a: $(LIBOBJECTS)
+	-$(RM) libjpeg.a
+	$(AR)  libjpeg.a  $(LIBOBJECTS)
+	$(AR2) libjpeg.a
+
+cjpeg.exe: $(COBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) libjpeg.a $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) libjpeg.a $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) libjpeg.a $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.o
+	$(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.o
+	$(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	-$(RM) *.o
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) libjpeg.a
+	-$(RM) testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	-$(RM) testout*.*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.exe
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/makefile.mgwdll
+++ b/makefile.mgwdll
@@ -0,0 +1,310 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for MinGW.
+# It builds the IJG library as a dynamically linkable library (.DLL),
+# and builds the sample applications which are linked against the DLL.
+
+# Read installation instructions before saying "make" !!
+
+srcdir = .
+VPATH  = $(srcdir)
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options:
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time cc options:
+LDFLAGS= -s
+LDFLAGS_DLL= $(LDFLAGS) -shared
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# DLL to build
+DLLNAME = jpeg62.dll
+# import library
+LIBNAME = libjpeg.dll.a
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= del
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These objectfiles are included in libjpeg.a
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+
+all: $(DLLNAME) cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+$(LIBNAME): $(DLLNAME)
+$(DLLNAME): $(LIBOBJECTS) jpegdll.o jpegdll.def
+	$(LN) $(LDFLAGS_DLL) -o $(DLLNAME) -Wl,--out-implib,$(LIBNAME) \
+		$(LIBOBJECTS) jpegdll.o jpegdll.def
+
+jpegdll.o: jpegdll.rc
+	windres -O coff -o $@ $*.rc
+
+cjpeg.exe: $(COBJECTS) $(LIBNAME)
+	$(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) $(LIBNAME) $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) $(LIBNAME)
+	$(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) $(LIBNAME) $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) $(LIBNAME)
+	$(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) $(LIBNAME) $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.o
+	$(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.o
+	$(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	-$(RM) *.o
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) $(DLLNAME)
+	-$(RM) $(LIBNAME)
+	-$(RM) testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	-$(RM) testout*.*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.exe
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/makefile.unix
+++ b/makefile.unix
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension

 # This makefile is suitable for Unix-like systems with non-ANSI compilers.
 # If you have an ANSI compiler, makefile.ansi is a better starting point.
@@ -15,6 +16,13 @@ CFLAGS= -O
 # However, any special defines for ansi2knr.c may be included here:
 ANSI2KNRFLAGS= 

+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -faout -DAOUT
+
 # Link-time cc options:
 LDFLAGS= 

@@ -26,6 +34,10 @@ LDLIBS=
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= jmemnobs.o

+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -79,17 +91,23 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -139,7 +157,7 @@ jconfig.h: jconfig.doc

 clean:
 	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout*
+	$(RM) jsimdcfg.inc ansi2knr core testout*

 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -157,10 +175,63 @@ test: cjpeg djpeg jpegtran
 	cmp testorig.jpg testoutt.jpg


+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg ./makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) ./makecfg
+
+.asm.o:
+	$(NASM) $(NAFLAGS) -o $@ $*.asm
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -171,33 +242,33 @@ jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
--- a/makefile.vc
+++ b/makefile.vc
@@ -1,32 +1,50 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension

 # This makefile is for Microsoft Visual C++ on Windows NT (and 95?).
 # It builds the IJG library as a statically linkable library (.LIB),
 # and builds the sample applications as console-mode apps.
-# Thanks to Xingong Chang, Raymond Everly and others.

 # Read installation instructions before saying "nmake" !!
-# To build an optimized library without debug info, say "nmake nodebug=1".

-# Pull in standard variable definitions
-!include <win32.mak>
+# The name of your C compiler:
+CC= cl
+LD= link

 # You may want to adjust these compiler options:
-CFLAGS= $(cflags) $(cdebug) $(cvars) -I.
+!ifdef crtdll
+# (DLL version of CRT)
+CFLAGS= -nologo -c -MD -W3 -O2 -GF -Gy -DNDEBUG -I.
+!else
+# (Single threaded static CRT)
+CFLAGS= -nologo -c -ML -W3 -O2 -GF -Gy -DNDEBUG -I.
+!endif
+
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.

+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
 # Link-time options:
-LDFLAGS= $(ldebug) $(conlflags)
+LDFLAGS= -nologo -release -subsystem:console,4.0 -opt:nowin98

 # To link any special libraries, add the necessary commands here.
-LDLIBS= $(conlibs)
+LDLIBS= 

 # Put here the object file name for the correct system-dependent memory
 # manager file.  For NT we suggest jmemnobs.obj, which expects the OS to
 # provide adequate virtual memory.
 SYSDEPMEM= jmemnobs.obj

+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+
 # miscellaneous OS-dependent stuff
 # file deletion command
 RM= del
@@ -72,18 +90,26 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
 # decompression library object files
 DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
 # These objectfiles are included in libjpeg.lib
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -94,38 +120,46 @@ DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
 TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj

 # Template command for compiling .c to .obj
-.c.obj:
-	$(cc) $(CFLAGS) $*.c
+.c.obj::
+	$(CC) $(CFLAGS) $<


 all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe

 libjpeg.lib: $(LIBOBJECTS)
-	$(RM) libjpeg.lib
+	-$(RM) libjpeg.lib
 	lib -out:libjpeg.lib  $(LIBOBJECTS)

 cjpeg.exe: $(COBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)

 djpeg.exe: $(DOBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)

 jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)

 rdjpgcom.exe: rdjpgcom.obj
-	$(link) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)

 wrjpgcom.exe: wrjpgcom.obj
-	$(link) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)


 clean:
-	$(RM) *.obj *.exe libjpeg.lib
-	$(RM) testout*
+	-$(RM) *.obj
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) libjpeg.lib
+	-if exist *.manifest $(RM) *.manifest
+	-if exist testout*   $(RM) testout*

 test: cjpeg.exe djpeg.exe jpegtran.exe
-	$(RM) testout*
+	-if exist testout* $(RM) testout*
 	.\djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
 	.\djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
 	.\cjpeg -dct int -outfile testout.jpg  testimg.ppm
@@ -140,10 +174,66 @@ test: cjpeg.exe djpeg.exe jpegtran.exe
 	fc /b testorig.jpg testoutt.jpg


+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) makecfg.c
+	$(LD) $(LDFLAGS) -out:makecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.obj
+	$(RM) makecfg.exe
+	if exist makecfg.exe.manifest $(RM) makecfg.exe.manifest
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -154,33 +244,33 @@ jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerro
 jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
--- a/makefile.vcdll
+++ b/makefile.vcdll
@@ -0,0 +1,311 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for Microsoft Visual C++ 6.0.
+# It builds the IJG library as a dynamically linkable library (.DLL),
+# and builds the sample applications which are linked against the DLL.
+
+# Read installation instructions before saying "nmake" !!
+
+# The name of your C compiler:
+CC= cl
+LD= link
+RC= rc
+
+# You may want to adjust these compiler options:
+#  You have to use a DLL version of C Run-Time library for both
+#  the JPEG DLL and any applications linked to the JPEG DLL.
+CFLAGS= -nologo -c -MD -W3 -O2 -GF -Gy -DNDEBUG -I.
+
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time options:
+LDFLAGS= -nologo -release -subsystem:console,4.0 -opt:nowin98
+LDFLAGS_DLL= -nologo -release -dll -opt:nowin98
+
+# To link any special libraries, add the necessary commands here.
+LDLIBS= 
+
+# DLL to build
+DLLNAME = jpeg62.dll
+# import library
+LIBNAME = jpeg62.lib
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.  For NT we suggest jmemnobs.obj, which expects the OS to
+# provide adequate virtual memory.
+SYSDEPMEM= jmemnobs.obj
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+
+# miscellaneous OS-dependent stuff
+# file deletion command
+RM= del
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
+        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
+        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
+# decompression library object files
+DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
+        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
+# These objectfiles are included in libjpeg.lib
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
+        rdswitch.obj cdjpeg.obj
+DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
+        rdcolmap.obj cdjpeg.obj
+TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
+
+# Template command for compiling .c to .obj
+.c.obj::
+	$(CC) $(CFLAGS) $<
+
+
+all: $(DLLNAME) cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+$(LIBNAME): $(DLLNAME)
+$(DLLNAME): $(LIBOBJECTS) jpegdll.res jpegdll.def
+	$(LD) $(LDFLAGS_DLL) -out:$(DLLNAME) -implib:$(LIBNAME) \
+		$(LIBOBJECTS) jpegdll.res -def:jpegdll.def
+
+jpegdll.res: jpegdll.rc
+	$(RC) -fo $@ $*.rc
+
+cjpeg.exe: $(COBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) $(LIBNAME) $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) $(LIBNAME) $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) $(LIBNAME) $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.obj
+	$(LD) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.obj
+	$(LD) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+
+
+clean:
+	-$(RM) *.obj
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) jpegdll.res
+	-$(RM) $(DLLNAME)
+	-$(RM) $(DLLNAME:.dll=.exp)
+	-$(RM) $(LIBNAME)
+	-if exist *.manifest $(RM) *.manifest
+	-if exist testout*   $(RM) testout*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	-if exist testout* $(RM) testout*
+	.\djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
+	.\djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
+	.\cjpeg -dct int -outfile testout.jpg  testimg.ppm
+	.\djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
+	.\cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
+	.\jpegtran -outfile testoutt.jpg testprog.jpg
+	fc /b testimg.ppm testout.ppm
+	fc /b testimg.bmp testout.bmp
+	fc /b testimg.jpg testout.jpg
+	fc /b testimg.ppm testoutp.ppm
+	fc /b testimgp.jpg testoutp.jpg
+	fc /b testorig.jpg testoutt.jpg
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) makecfg.c
+	$(LD) $(LDFLAGS) -out:makecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.obj
+	$(RM) makecfg.exe
+	if exist makecfg.exe.manifest $(RM) makecfg.exe.manifest
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/Show More
+++ b/Show More