From 6916f8795542e43e322ee5df88858665f24bd505 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 3 Feb 2012 08:59:21 +0000
Subject: [PATCH 01/26] Several clarifications regarding v7/v8 API/ABI support

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@776 632fc199-4ca6-4c93-a231-07263d6284db
---
 README-turbo.txt | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/README-turbo.txt b/README-turbo.txt
index 24490e39..bdf2e10b 100755
--- a/README-turbo.txt
+++ b/README-turbo.txt
@@ -256,24 +256,17 @@ for the existence of the colorspace extensions at compile time and run time.
 libjpeg v7 and v8 API/ABI support
 =================================
 
-libjpeg v7 and v8 added new features to the API/ABI, and, unfortunately, the
-compression and decompression structures were extended in a backward-
-incompatible manner to accommodate these features.  Thus, programs that are
+With libjpeg v7 and v8, new features were added that necessitated extending the
+compression and decompression structures.  Unfortunately, due to the exposed
+nature of those structures, extending them also necessitated breaking backward
+ABI compatibility with previous libjpeg releases.  Thus, programs that are
 built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
 based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are still not
 as widely used as v6b, enough programs (including a few Linux distros) have
 made the switch that it was desirable to provide support for the libjpeg v7/v8
-API/ABI in libjpeg-turbo.
-
-Some of the libjpeg v7 and v8 features -- DCT scaling, to name one -- involve
-deep modifications to the code that cannot be accommodated by libjpeg-turbo
-without either breaking compatibility with libjpeg v6b or producing an
-unsupportable mess.  In order to fully support libjpeg v8 with all of its
-features, we would have to essentially port the SIMD extensions to the libjpeg
-v8 code base and maintain two separate code trees.  We are hesitant to do this
-until/unless the newer libjpeg code bases garner more community support and
-involvement and until/unless we have some notion of whether future libjpeg
-releases will also be backward-incompatible.
+API/ABI in libjpeg-turbo.  Although libjpeg-turbo can now be configured as a
+drop-in replacement for libjpeg v7 or v8, it should be noted that not all of
+the features in libjpeg v7 and v8 are supported (see below.)
 
 By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
 argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
@@ -312,6 +305,16 @@ Not supported:
 
 -- libjpeg: DCT scaling in compressor
    cinfo.scale_num and cinfo.scale_denom are silently ignored.
+   There is no technical reason why DCT scaling cannot be supported, but
+   without the SmartScale extension (see below), it would only be able to
+   down-scale using ratios of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and 8/9,
+   which is of limited usefulness.
+
+-- libjpeg: SmartScale
+   cinfo.block_size is silently ignored.
+   SmartScale is an extension to the JPEG format that allows for DCT block
+   sizes other than 8x8.  It would be difficult to support this feature while
+   retaining backward compatibility with libjpeg v6b.
 
 -- libjpeg: IDCT scaling extensions in decompressor
    libjpeg-turbo still supports IDCT scaling with scaling factors of 1/2, 1/4,
@@ -319,9 +322,14 @@ Not supported:
 
 -- libjpeg: Fancy downsampling in compressor
    cinfo.do_fancy_downsampling is silently ignored.
+   This requires the DCT scaling feature, which is not supported.
 
 -- jpegtran: Scaling
-   Seems to depend on the DCT scaling feature, which isn't supported.
+   This requires both the DCT scaling and SmartScale features, which are not
+   supported.
+
+-- Lossless RGB JPEG files
+   This requires the SmartScale feature, which is not supported.
 
 
 *******************************************************************************

From 11a122b2d66faeaa0e55b5cceecc60ce6049c073 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 7 Feb 2012 00:14:53 +0000
Subject: [PATCH 02/26] Wordsmithing and grammar

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@777 632fc199-4ca6-4c93-a231-07263d6284db
---
 README-turbo.txt | 71 ++++++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/README-turbo.txt b/README-turbo.txt
index bdf2e10b..899a3681 100755
--- a/README-turbo.txt
+++ b/README-turbo.txt
@@ -3,31 +3,31 @@
 *******************************************************************************
 
 libjpeg-turbo is a derivative of libjpeg that uses SIMD instructions (MMX,
-SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86
-and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast
-as the unmodified version of libjpeg, all else being equal.
+SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86,
+x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as
+fast as the unmodified version of libjpeg, all else being equal.
 
 libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
 the TigerVNC and VirtualGL projects made numerous enhancements to the codec in
 2009, including improved support for Mac OS X, 64-bit support, support for
-32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
-encoding/decoding, and various bug fixes.  The goal was to produce a fully open
-source codec that could replace the partially closed source TurboJPEG/IPP codec
-used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range
-of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.
+32-bit and big-endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
+encoding/decoding, and various bug fixes.  The goal was to produce a fully
+open-source codec that could replace the partially closed-source TurboJPEG/IPP
+codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally achieves 80-120%
+of the performance of TurboJPEG/IPP.  It is faster in some areas but slower in
+others.
 
 In early 2010, libjpeg-turbo spun off into its own independent project, with
 the goal of making high-speed JPEG compression/decompression technology
-available to a broader range of users and developers.  The libjpeg-turbo shared
-libraries can be used as drop-in replacements for libjpeg on most systems.
+available to a broader range of users and developers.
 
 
 *******************************************************************************
 **     License
 *******************************************************************************
 
-libjpeg-turbo is licensed under a non-restrictive, BSD-style license
-(see README.)  The TurboJPEG/OSS wrapper (both C and Java versions) and
+Most of libjpeg-turbo inherits the non-restrictive, BSD-style license used by
+libjpeg (see README.)  The TurboJPEG/OSS wrapper (both C and Java versions) and
 associated test programs bear a similar license, which is reproduced below:
 
 Redistribution and use in source and binary forms, with or without
@@ -62,14 +62,14 @@ POSSIBILITY OF SUCH DAMAGE.
 libjpeg-turbo includes two APIs that can be used to compress and decompress
 JPEG images:
 
-  TurboJPEG API:  This API wraps libjpeg-turbo and provides an easy-to-use
-  interface for compressing and decompressing JPEG images in memory.  It also
-  provides some features that would not be straightforward to implement using
-  the underlying libjpeg API, such as generating planar YUV images and
-  performing multiple simultaneous lossless transforms on an image.  The Java
-  interface for libjpeg-turbo is written on top of the TurboJPEG API.
+  TurboJPEG API:  This API provides an easy-to-use interface for compressing
+  and decompressing JPEG images in memory.  It also provides some functionality
+  that would not be straightforward to achieve using the underlying libjpeg
+  API, such as generating planar YUV images and performing multiple
+  simultaneous lossless transforms on an image.  The Java interface for
+  libjpeg-turbo is written on top of the TurboJPEG API.
 
-  libjpeg API:  This is the industry standard API for compressing and
+  libjpeg API:  This is the de facto industry-standard API for compressing and
   decompressing JPEG images.  It is more difficult to use than the TurboJPEG
   API but also more powerful.  libjpeg-turbo is both API/ABI-compatible and
   mathematically compatible with libjpeg v6b.  It can also optionally be
@@ -101,13 +101,13 @@ NOTE: {lib} can be lib, lib32, lib64, or lib/64, depending on the O/S and
 architecture.
 
 System administrators can also replace the libjpeg sym links in /usr/{lib} with
-links to the libjpeg dynamic library located in /opt/libjpeg-turbo/{lib}.  This
-will effectively accelerate every dynamically linked libjpeg application on the
-system.
+links to the libjpeg-turbo dynamic library located in /opt/libjpeg-turbo/{lib}.
+This will effectively accelerate every application that uses the libjpeg
+dynamic library on the system.
 
 The libjpeg-turbo SDK for Visual C++ installs the libjpeg-turbo DLL
-(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether libjpeg v6b, v7, or
-v8 emulation is enabled) into c:\libjpeg-turbo[64]\bin, and the PATH
+(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether it was built with
+libjpeg v6b, v7, or v8 emulation) into c:\libjpeg-turbo[64]\bin, and the PATH
 environment variable can be modified such that this directory is searched
 before any others that might contain a libjpeg DLL.  However, if a libjpeg
 DLL exists in an application's install directory, then Windows will load this
@@ -117,16 +117,16 @@ version of this DLL and copy c:\libjpeg-turbo[64]\bin\jpeg*.dll into the
 application's install directory to accelerate it.
 
 The version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
-Visual C++ requires the Visual C++ 2008 C run time DLL (msvcr90.dll).
+Visual C++ requires the Visual C++ 2008 C run-time DLL (msvcr90.dll).
 msvcr90.dll ships with more recent versions of Windows, but users of older
 Windows releases can obtain it from the Visual C++ 2008 Redistributable
 Package, which is available as a free download from Microsoft's web site.
 
-NOTE:  Features of libjpeg that require passing a C run time structure, such
+NOTE:  Features of libjpeg that require passing a C run-time structure, such
 as a file handle, from an application to libjpeg will probably not work with
 the version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
 Visual C++, unless the application is also built to use the Visual C++ 2008 C
-run time DLL.  In particular, this affects jpeg_stdio_dest() and
+run-time DLL.  In particular, this affects jpeg_stdio_dest() and
 jpeg_stdio_src().
 
 Mac applications typically embed their own copies of the libjpeg dylib inside
@@ -146,7 +146,7 @@ Replacing TurboJPEG/IPP
 libjpeg-turbo is a drop-in replacement for the TurboJPEG/IPP SDK used by
 VirtualGL 2.1.x and TurboVNC 0.6 (and prior.)  libjpeg-turbo contains a wrapper
 library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
-instead of the closed source Intel Performance Primitives.  You can replace the
+instead of the closed-source Intel Performance Primitives.  You can replace the
 TurboJPEG/IPP package on Linux systems with the libjpeg-turbo package in order
 to make existing releases of VirtualGL 2.1.x and TurboVNC 0.x use the new codec
 at run time.  Note that the 64-bit libjpeg-turbo packages contain only 64-bit
@@ -157,7 +157,7 @@ both the 64-bit and 32-bit versions of libjpeg-turbo.
 You can also build the VirtualGL 2.1.x and TurboVNC 0.6 source code with
 the libjpeg-turbo SDK instead of TurboJPEG/IPP.  It should work identically.
 libjpeg-turbo also includes static library versions of TurboJPEG/OSS, which
-are used to build TurboVNC 1.0 and later.
+are used to build VirtualGL 2.2 and TurboVNC 1.0 and later.
 
 ========================================
 Using libjpeg-turbo in Your Own Programs
@@ -341,12 +341,13 @@ Restart Markers
 ===============
 
 The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
-in a way that makes libjpeg happy, so it is necessary to use the slow Huffman
-decoder when decompressing a JPEG image that has restart markers.  This can
-cause the decompression performance to drop by as much as 20%, but the
-performance will still be much much greater than that of libjpeg v6b.  Many
-consumer packages, such as PhotoShop, use restart markers when generating JPEG
-images, so images generated by those programs will experience this issue.
+in a way that makes the rest of the libjpeg infrastructure happy, so it is
+necessary to use the slow Huffman decoder when decompressing a JPEG image that
+has restart markers.  This can cause the decompression performance to drop by
+as much as 20%, but the performance will still be much greater than that of
+libjpeg.  Many consumer packages, such as Photoshop, use restart markers when
+generating JPEG images, so images generated by those programs will experience
+this issue.
 
 ===============================================
 Fast Integer Forward DCT at High Quality Levels

From 760ea8dfb38cea7925b663cd392ed737a40bf078 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 7 Feb 2012 23:25:19 +0000
Subject: [PATCH 03/26] Merge documentation and wordsmithing changes from 1.2,
 including promotion of -arithmetic to a "switch for advanced users"

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.1.x@783 632fc199-4ca6-4c93-a231-07263d6284db
---
 BUILDING.txt     | 24 +++++++-------
 CMakeLists.txt   |  4 ++-
 README           | 19 ++++++-----
 README-turbo.txt | 85 ++++++++++++++++++++++++++----------------------
 cjpeg.1          | 16 ++++-----
 cjpeg.c          |  6 ++--
 install.txt      | 10 +++---
 jpegtran.1       |  4 +--
 jpegtran.c       |  6 ++--
 libjpeg.txt      | 23 +++++++------
 structure.txt    | 16 ++++-----
 usage.txt        | 16 ++++-----
 12 files changed, 122 insertions(+), 107 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index 70fa4005..01f67c16 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -99,14 +99,14 @@ This will generate the following files under .libs/
 62, 7, or 8.
 
 
-libjpeg v7 or v8 Emulation
---------------------------
+libjpeg v7 or v8 API/ABI Emulation
+----------------------------------
 
 Add --with-jpeg7 to the configure command line to build a version of
-libjpeg-turbo that is compatible with libjpeg v7.  Add --with-jpeg8 to the
-configure command to build a version of libjpeg-turbo that is compatible with
-libjpeg v8.  See README-turbo.txt for more information on libjpeg v7 and v8
-emulation.
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add --with-jpeg8 to
+the configure command to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
+on libjpeg v7 and v8 emulation.
 
 
 Arithmetic Coding Support
@@ -416,14 +416,14 @@ NMake, remove "-DCMAKE_BUILD_TYPE=Release" (Debug builds are the default with
 NMake.)
 
 
-libjpeg v7 or v8 Emulation
---------------------------
+libjpeg v7 or v8 API/ABI Emulation
+----------------------------------
 
 Add "-DWITH_JPEG7=1" to the cmake command line to build a version of
-libjpeg-turbo that is compatible with libjpeg v7.  Add "-DWITH_JPEG8=1" to the
-cmake command to build a version of libjpeg-turbo that is compatible with
-libjpeg v8.  See README-turbo.txt for more information on libjpeg v7 and v8
-emulation.
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add "-DWITH_JPEG8=1"
+to the cmake command to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See README-turbo.txt for more information
+on libjpeg v7 and v8 emulation.
 
 
 Arithmetic Coding Support
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f703acf3..28f8e000 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,6 +18,8 @@ else()
   message(FATAL_ERROR "Platform not supported by this build system.  Use autotools instead.")
 endif()
 
+# This does nothing except when using MinGW.  CMAKE_BUILD_TYPE has no meaning
+# in Visual Studio, and it always defaults to Debug when using NMake.
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
@@ -25,7 +27,7 @@ endif()
 message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")
 
 # This only works if building from the command line.  There is currently no way
-# to set a variable's value based on the build type when using the MSVC IDE.
+# to set a variable's value based on the build type when using Visual Studio.
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(BUILD "${BUILD}d")
 endif()
diff --git a/README b/README
index 2ead09e6..0e9b4295 100644
--- a/README
+++ b/README
@@ -1,7 +1,8 @@
-libjpeg-turbo note:  This file is mostly taken from the libjpeg v8b README
-file, and it is included only for reference.  Some parts of it may not apply to
-libjpeg-turbo.  Please see README-turbo.txt for information specific to the
-turbo version.
+libjpeg-turbo note:  This file contains portions of the libjpeg v6b and v8
+README files, with additional wordsmithing by The libjpeg-turbo Project.
+It is included only for reference, as some parts of it may not apply to
+libjpeg-turbo.  Please see README-turbo.txt for information specific to
+libjpeg-turbo.
 
 
 The Independent JPEG Group's JPEG software
@@ -62,7 +63,7 @@ OVERVIEW
 This package contains C software to implement JPEG image encoding, decoding,
 and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
 method for full-color and gray-scale images.  JPEG's strong suit is compressing
-photographic images or other types of images which have smooth color and
+photographic images or other types of images that have smooth color and
 brightness transitions between neighboring pixels.  Images with sharp lines or
 other abrupt features may not compress well with JPEG, and a higher JPEG
 quality may have to be used to avoid visible compression artifacts with such
@@ -256,8 +257,8 @@ ARCHIVE LOCATIONS
 The "official" archive site for this software is www.ijg.org.
 The most recent released version can always be found there in
 directory "files".  This particular version will be archived as
-http://www.ijg.org/files/jpegsrc.v8b.tar.gz, and in Windows-compatible
-"zip" archive format as http://www.ijg.org/files/jpegsr8b.zip.
+http://www.ijg.org/files/jpegsrc.v8d.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr8d.zip.
 
 The JPEG FAQ (Frequently Asked Questions) article is a source of some
 general information about JPEG.
@@ -274,7 +275,7 @@ FILE FORMAT WARS
 ================
 
 The ISO JPEG standards committee actually promotes different formats like
-"JPEG 2000" or "JPEG XR" which are incompatible with original DCT-based
+"JPEG 2000" or "JPEG XR", which are incompatible with original DCT-based
 JPEG.  IJG therefore does not support these formats (see REFERENCES).  Indeed,
 one of the original reasons for developing this free software was to help
 force convergence on common, interoperable format standards for JPEG files.
@@ -286,4 +287,4 @@ image files indefinitely.)
 TO DO
 =====
 
-Please send bug reports, offers of help, etc. to jpeg-info@uc.ag.
+Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org.
diff --git a/README-turbo.txt b/README-turbo.txt
index a385270c..fcfd27e5 100755
--- a/README-turbo.txt
+++ b/README-turbo.txt
@@ -10,16 +10,16 @@ as the unmodified version of libjpeg, all else being equal.
 libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
 the TigerVNC and VirtualGL projects made numerous enhancements to the codec in
 2009, including improved support for Mac OS X, 64-bit support, support for
-32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
-encoding/decoding, and various bug fixes.  The goal was to produce a fully open
-source codec that could replace the partially closed source TurboJPEG/IPP codec
-used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range
-of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.
+32-bit and big-endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
+encoding/decoding, and various bug fixes.  The goal was to produce a fully
+open-source codec that could replace the partially closed-source TurboJPEG/IPP
+codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally achieves 80-120%
+of the performance of TurboJPEG/IPP.  It is faster in some areas but slower in
+others.
 
 In early 2010, libjpeg-turbo spun off into its own independent project, with
 the goal of making high-speed JPEG compression/decompression technology
-available to a broader range of users and developers.  The libjpeg-turbo shared
-libraries can be used as drop-in replacements for libjpeg on most systems.
+available to a broader range of users and developers.
 
 
 *******************************************************************************
@@ -72,13 +72,13 @@ NOTE: {lib} can be lib, lib32, lib64, or lib/64, depending on the O/S and
 architecture.
 
 System administrators can also replace the libjpeg sym links in /usr/{lib} with
-links to the libjpeg dynamic library located in /opt/libjpeg-turbo/{lib}.  This
-will effectively accelerate every dynamically linked libjpeg application on the
-system.
+links to the libjpeg-turbo dynamic library located in /opt/libjpeg-turbo/{lib}.
+This will effectively accelerate every application that uses the libjpeg
+dynamic library on the system.
 
 The libjpeg-turbo SDK for Visual C++ installs the libjpeg-turbo DLL
-(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether libjpeg v6b, v7, or
-v8 emulation is enabled) into c:\libjpeg-turbo[64]\bin, and the PATH
+(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether it was built with
+libjpeg v6b, v7, or v8 emulation) into c:\libjpeg-turbo[64]\bin, and the PATH
 environment variable can be modified such that this directory is searched
 before any others that might contain a libjpeg DLL.  However, if a libjpeg
 DLL exists in an application's install directory, then Windows will load this
@@ -88,16 +88,16 @@ version of this DLL and copy c:\libjpeg-turbo[64]\bin\jpeg*.dll into the
 application's install directory to accelerate it.
 
 The version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
-Visual C++ requires the Visual C++ 2008 C run time DLL (msvcr90.dll).
+Visual C++ requires the Visual C++ 2008 C run-time DLL (msvcr90.dll).
 msvcr90.dll ships with more recent versions of Windows, but users of older
 Windows releases can obtain it from the Visual C++ 2008 Redistributable
 Package, which is available as a free download from Microsoft's web site.
 
-NOTE:  Features of libjpeg that require passing a C run time structure, such
+NOTE:  Features of libjpeg that require passing a C run-time structure, such
 as a file handle, from an application to libjpeg will probably not work with
 the version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
 Visual C++, unless the application is also built to use the Visual C++ 2008 C
-run time DLL.  In particular, this affects jpeg_stdio_dest() and
+run-time DLL.  In particular, this affects jpeg_stdio_dest() and
 jpeg_stdio_src().
 
 Mac applications typically embed their own copies of the libjpeg dylib inside
@@ -117,7 +117,7 @@ Replacing TurboJPEG/IPP
 libjpeg-turbo is a drop-in replacement for the TurboJPEG/IPP SDK used by
 VirtualGL 2.1.x and TurboVNC 0.6 (and prior.)  libjpeg-turbo contains a wrapper
 library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
-instead of the closed source Intel Performance Primitives.  You can replace the
+instead of the closed-source Intel Performance Primitives.  You can replace the
 TurboJPEG/IPP package on Linux systems with the libjpeg-turbo package in order
 to make existing releases of VirtualGL 2.1.x and TurboVNC 0.x use the new codec
 at run time.  Note that the 64-bit libjpeg-turbo packages contain only 64-bit
@@ -128,7 +128,7 @@ both the 64-bit and 32-bit versions of libjpeg-turbo.
 You can also build the VirtualGL 2.1.x and TurboVNC 0.6 source code with
 the libjpeg-turbo SDK instead of TurboJPEG/IPP.  It should work identically.
 libjpeg-turbo also includes static library versions of TurboJPEG/OSS, which
-are used to build TurboVNC 1.0 and later.
+are used to build VirtualGL 2.2 and TurboVNC 1.0 and later.
 
 ========================================
 Using libjpeg-turbo in Your Own Programs
@@ -208,24 +208,17 @@ that doesn't support them will result in a "Bogus input colorspace" error.
 libjpeg v7 and v8 API/ABI support
 =================================
 
-libjpeg v7 and v8 added new features to the API/ABI, and, unfortunately, the
-compression and decompression structures were extended in a backward-
-incompatible manner to accommodate these features.  Thus, programs that are
+With libjpeg v7 and v8, new features were added that necessitated extending the
+compression and decompression structures.  Unfortunately, due to the exposed
+nature of those structures, extending them also necessitated breaking backward
+ABI compatibility with previous libjpeg releases.  Thus, programs that are
 built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
 based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are still not
 as widely used as v6b, enough programs (including a few Linux distros) have
 made the switch that it was desirable to provide support for the libjpeg v7/v8
-API/ABI in libjpeg-turbo.
-
-Some of the libjpeg v7 and v8 features -- DCT scaling, to name one -- involve
-deep modifications to the code that cannot be accommodated by libjpeg-turbo
-without either breaking compatibility with libjpeg v6b or producing an
-unsupportable mess.  In order to fully support libjpeg v8 with all of its
-features, we would have to essentially port the SIMD extensions to the libjpeg
-v8 code base and maintain two separate code trees.  We are hesitant to do this
-until/unless the newer libjpeg code bases garner more community support and
-involvement and until/unless we have some notion of whether future libjpeg
-releases will also be backward-incompatible.
+API/ABI in libjpeg-turbo.  Although libjpeg-turbo can now be configured as a
+drop-in replacement for libjpeg v7 or v8, it should be noted that not all of
+the features in libjpeg v7 and v8 are supported (see below.)
 
 By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
 argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
@@ -264,6 +257,16 @@ Not supported:
 
 -- libjpeg: DCT scaling in compressor
    cinfo.scale_num and cinfo.scale_denom are silently ignored.
+   There is no technical reason why DCT scaling cannot be supported, but
+   without the SmartScale extension (see below), it would only be able to
+   down-scale using ratios of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and 8/9,
+   which is of limited usefulness.
+
+-- libjpeg: SmartScale
+   cinfo.block_size is silently ignored.
+   SmartScale is an extension to the JPEG format that allows for DCT block
+   sizes other than 8x8.  It would be difficult to support this feature while
+   retaining backward compatibility with libjpeg v6b.
 
 -- libjpeg: IDCT scaling extensions in decompressor
    libjpeg-turbo still supports IDCT scaling with scaling factors of 1/2, 1/4,
@@ -271,9 +274,14 @@ Not supported:
 
 -- libjpeg: Fancy downsampling in compressor
    cinfo.do_fancy_downsampling is silently ignored.
+   This requires the DCT scaling feature, which is not supported.
 
 -- jpegtran: Scaling
-   Seems to depend on the DCT scaling feature, which isn't supported.
+   This requires both the DCT scaling and SmartScale features, which are not
+   supported.
+
+-- Lossless RGB JPEG files
+   This requires the SmartScale feature, which is not supported.
 
 
 *******************************************************************************
@@ -285,12 +293,13 @@ Restart Markers
 ===============
 
 The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
-in a way that makes libjpeg happy, so it is necessary to use the slow Huffman
-decoder when decompressing a JPEG image that has restart markers.  This can
-cause the decompression performance to drop by as much as 20%, but the
-performance will still be much much greater than that of libjpeg v6b.  Many
-consumer packages, such as PhotoShop, use restart markers when generating JPEG
-images, so images generated by those programs will experience this issue.
+in a way that makes the rest of the libjpeg infrastructure happy, so it is
+necessary to use the slow Huffman decoder when decompressing a JPEG image that
+has restart markers.  This can cause the decompression performance to drop by
+as much as 20%, but the performance will still be much greater than that of
+libjpeg.  Many consumer packages, such as Photoshop, use restart markers when
+generating JPEG images, so images generated by those programs will experience
+this issue.
 
 ===============================================
 Fast Integer Forward DCT at High Quality Levels
diff --git a/cjpeg.1 b/cjpeg.1
index e4d95ee8..6fb72993 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -1,4 +1,4 @@
-.TH CJPEG 1 "11 October 2010"
+.TH CJPEG 1 "31 January 2012"
 .SH NAME
 cjpeg \- compress an image file to a JPEG file
 .SH SYNOPSIS
@@ -121,7 +121,7 @@ quality-sensitive applications, for which the artifacts generated by
 subsampling may be unacceptable.
 .PP
 The \fB-quality\fR option accepts a comma-separated list of parameters, which
-respectively refer to the quality levels which should be assigned to the
+respectively refer to the quality levels that should be assigned to the
 quantization table slots.  If there are more q-table slots than parameters,
 then the last parameter is replicated.  Thus, if only one quality parameter is
 given, this is used for both luminance and chrominance (slots 0 and 1,
@@ -150,6 +150,12 @@ about the same --- often a little smaller.
 .PP
 Switches for advanced users:
 .TP
+.B \-arithmetic
+Use arithmetic coding.
+.B Caution:
+arithmetic-coded JPEG is not yet widely implemented, so many decoders will be
+unable to view an arithmetic-coded JPEG file at all.
+.TP
 .B \-dct int
 Use integer DCT method (default).
 .TP
@@ -214,12 +220,6 @@ visibly blur the image, however.
 .PP
 Switches for wizards:
 .TP
-.B \-arithmetic
-Use arithmetic coding.
-.B Caution:
-arithmetic coded JPEG is not yet widely implemented, so many decoders will be
-unable to view an arithmetic coded JPEG file at all.
-.TP
 .B \-baseline
 Force baseline-compatible quantization tables to be generated.  This clamps
 quantization values to 8 bits even at low quality settings.  (This switch is
diff --git a/cjpeg.c b/cjpeg.c
index 6f407f83..0475c023 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -164,6 +164,9 @@ usage (void)
   fprintf(stderr, "  -targa         Input file is Targa format (usually not needed)\n");
 #endif
   fprintf(stderr, "Switches for advanced users:\n");
+#ifdef C_ARITH_CODING_SUPPORTED
+  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
+#endif
 #ifdef DCT_ISLOW_SUPPORTED
   fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
 	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
@@ -184,9 +187,6 @@ usage (void)
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "Switches for wizards:\n");
-#ifdef C_ARITH_CODING_SUPPORTED
-  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
-#endif
   fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
   fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
   fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
diff --git a/install.txt b/install.txt
index 2ee86adf..1327dc48 100644
--- a/install.txt
+++ b/install.txt
@@ -534,17 +534,17 @@ In general, it's worth trying the maximum optimization level of your compiler,
 and experimenting with any optional optimizations such as loop unrolling.
 (Unfortunately, far too many compilers have optimizer bugs ... be prepared to
 back off if the code fails self-test.)  If you do any experimentation along
-these lines, please report the optimal settings to jpeg-info@uc.ag so we
-can mention them in future releases.  Be sure to specify your machine
-and compiler version.
+these lines, please report the optimal settings to jpeg-info@jpegclub.org so
+we can mention them in future releases.  Be sure to specify your machine and
+compiler version.
 
 
 HINTS FOR SPECIFIC SYSTEMS
 ==========================
 
 We welcome reports on changes needed for systems not mentioned here.  Submit
-'em to jpeg-info@uc.ag.  Also, if configure or ckconfig.c is wrong about how
-to configure the JPEG software for your system, please let us know.
+'em to jpeg-info@jpegclub.org.  Also, if configure or ckconfig.c is wrong
+about how to configure the JPEG software for your system, please let us know.
 
 
 Acorn RISC OS:
diff --git a/jpegtran.1 b/jpegtran.1
index f10cdbb6..160b47da 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "11 October 2010"
+.TH JPEGTRAN 1 "31 January 2012"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -184,7 +184,7 @@ comments and other excess baggage present in the source file.
 .TP
 .B \-copy comments
 Copy only comment markers.  This setting copies comments from the source file
-but discards any other data which is inessential for image display.
+but discards any other data that is inessential for image display.
 .TP
 .B \-copy all
 Copy all extra markers.  This setting preserves miscellaneous markers
diff --git a/jpegtran.c b/jpegtran.c
index b2a31adb..40d4b6cb 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -78,14 +78,14 @@ usage (void)
   fprintf(stderr, "  -trim          Drop non-transformable edge blocks\n");
 #endif
   fprintf(stderr, "Switches for advanced users:\n");
+#ifdef C_ARITH_CODING_SUPPORTED
+  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
+#endif
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "Switches for wizards:\n");
-#ifdef C_ARITH_CODING_SUPPORTED
-  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
-#endif
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
 #endif
diff --git a/libjpeg.txt b/libjpeg.txt
index 191b35e8..d350fc73 100644
--- a/libjpeg.txt
+++ b/libjpeg.txt
@@ -850,8 +850,9 @@ int jpeg_quality_scaling (int quality)
 	premise of this routine collapses.  Caveat user.
 
 jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
-	[libjpeg v7/v8 only] Set default quantization tables with linear
-	q_scale_factor[] values (see below).
+	[libjpeg v7+ API/ABI emulation only]
+	Set default quantization tables with linear q_scale_factor[] values
+	(see below).
 
 jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 		      const unsigned int *basic_table,
@@ -976,12 +977,13 @@ JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]
 	slot 1 for chrominance.
 
 int q_scale_factor[NUM_QUANT_TBLS]
-	[libjpeg v7+ only] Linear quantization scaling factors (0-100, default
-	100) for use with jpeg_default_qtables().
+	[libjpeg v7+ API/ABI emulation only]
+	Linear quantization scaling factors (0-100, default 100)
+	for use with jpeg_default_qtables().
 	See rdswitch.c and cjpeg.c for an example of usage.
 	Note that the q_scale_factor[] values use "linear" scales, so JPEG
 	quality levels chosen by the user must be converted to these scales
-	using jpeg_quality_scaling().  Here is an example which corresponds to
+	using jpeg_quality_scaling().  Here is an example that corresponds to
 	cjpeg -quality 90,70:
 
 		jpeg_set_defaults(cinfo);
@@ -1012,11 +1014,12 @@ JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
 	any need to mess with providing your own Huffman tables.
 
 
-[libjpeg v7+ only] The actual dimensions of the JPEG image that will be written
-to the file are given by the following fields.  These are computed from the
-input image dimensions and the compression parameters by jpeg_start_compress().
-You can also call jpeg_calc_jpeg_dimensions() to obtain the values that will
-result from the current parameter settings.
+[libjpeg v7+ API/ABI emulation only]
+The actual dimensions of the JPEG image that will be written to the file are
+given by the following fields.  These are computed from the input image
+dimensions and the compression parameters by jpeg_start_compress().  You can
+also call jpeg_calc_jpeg_dimensions() to obtain the values that will result
+from the current parameter settings.
 
 JDIMENSION jpeg_width		Actual dimensions of output image.
 JDIMENSION jpeg_height
diff --git a/structure.txt b/structure.txt
index 779233a8..6a9266ba 100644
--- a/structure.txt
+++ b/structure.txt
@@ -1,6 +1,6 @@
 IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
 
-Copyright (C) 1991-2009, Thomas G. Lane, Guido Vollbeding.
+Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
 This file is part of the Independent JPEG Group's software.
 For conditions of distribution and use, see the accompanying README file.
 
@@ -385,8 +385,9 @@ objects:
 
 * Data destination manager: writes the output JPEG datastream to its final
   destination (e.g., a file).  The destination manager supplied with the
-  library knows how to write to a stdio stream; for other behaviors, the
-  surrounding application may provide its own destination manager.
+  library knows how to write to a stdio stream or to a memory buffer;
+  for other behaviors, the surrounding application may provide its own
+  destination manager.
 
 * Memory manager: allocates and releases memory, controls virtual arrays
   (with backing store management, where required).
@@ -504,9 +505,9 @@ objects:
 * Marker reading: decodes JPEG markers (except for RSTn).
 
 * Data source manager: supplies the input JPEG datastream.  The source
-  manager supplied with the library knows how to read from a stdio stream;
-  for other behaviors, the surrounding application may provide its own source
-  manager.
+  manager supplied with the library knows how to read from a stdio stream
+  or from a memory buffer; for other behaviors, the surrounding application
+  may provide its own source manager.
 
 * Memory manager: same as for compression library.
 
@@ -586,8 +587,7 @@ as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
 With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
 simplify correct rounding during downsampling, etc.  The JPEG standard's
 specification that sample values run from -128..127 is accommodated by
-subtracting 128 just as the sample value is copied into the source array for
-the DCT step (this will be an array of signed ints).  Similarly, during
+subtracting 128 from the sample value in the DCT step.  Similarly, during
 decompression the output of the IDCT step will be immediately shifted back to
 0..255.  (NB: different values are required when 12-bit samples are in use.
 The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
diff --git a/usage.txt b/usage.txt
index 7af75a8c..2abfbeab 100644
--- a/usage.txt
+++ b/usage.txt
@@ -131,7 +131,7 @@ quality-sensitive applications, for which the artifacts generated by
 subsampling may be unacceptable.
 
 The -quality option accepts a comma-separated list of parameters, which
-respectively refer to the quality levels which should be assigned to the
+respectively refer to the quality levels that should be assigned to the
 quantization table slots.  If there are more q-table slots than parameters,
 then the last parameter is replicated.  Thus, if only one quality parameter is
 given, this is used for both luminance and chrominance (slots 0 and 1,
@@ -157,6 +157,11 @@ file size is about the same --- often a little smaller.
 
 Switches for advanced users:
 
+	-arithmetic	Use arithmetic coding.  CAUTION: arithmetic coded JPEG
+			is not yet widely implemented, so many decoders will
+			be unable to view an arithmetic coded JPEG file at
+			all.
+
 	-dct int	Use integer DCT method (default).
 	-dct fast	Use fast integer DCT (less accurate).
 	-dct float	Use floating-point DCT method.
@@ -201,11 +206,6 @@ factor will visibly blur the image, however.
 
 Switches for wizards:
 
-	-arithmetic	Use arithmetic coding.  CAUTION: arithmetic coded JPEG
-			is not yet widely implemented, so many decoders will
-			be unable to view an arithmetic coded JPEG file at
-			all.
-
 	-baseline	Force baseline-compatible quantization tables to be
 			generated.  This clamps quantization values to 8 bits
 			even at low quality settings.  (This switch is poorly
@@ -447,9 +447,9 @@ To specify the coded JPEG representation used in the output file,
 jpegtran accepts a subset of the switches recognized by cjpeg:
 	-optimize	Perform optimization of entropy encoding parameters.
 	-progressive	Create progressive JPEG file.
+	-arithmetic	Use arithmetic coding.
 	-restart N	Emit a JPEG restart marker every N MCU rows, or every
 			N MCU blocks if "B" is attached to the number.
-	-arithmetic	Use arithmetic coding.
 	-scans file	Use the scan script given in the specified text file.
 See the previous discussion of cjpeg for more details about these switches.
 If you specify none of these switches, you get a plain baseline-JPEG output
@@ -527,7 +527,7 @@ markers, such as comment blocks:
 			present in the source file.
 	-copy comments	Copy only comment markers.  This setting copies
 			comments from the source file but discards
-			any other data which is inessential for image display.
+			any other data that is inessential for image display.
 	-copy all	Copy all extra markers.  This setting preserves
 			miscellaneous markers found in the source file, such
 			as JFIF thumbnails, Exif data, and Photoshop settings.

From 0f0fd75125bad97bafb3661d17a21049c75772fc Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 7 Feb 2012 23:27:14 +0000
Subject: [PATCH 04/26] Compiler warnings

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.1.x@784 632fc199-4ca6-4c93-a231-07263d6284db
---
 jcdctmgr.c        | 2 +-
 simd/jsimd_i386.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/jcdctmgr.c b/jcdctmgr.c
index 711f9dab..12f88725 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -182,7 +182,7 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
     /* fq will be one bit too large to fit in DCTELEM, so adjust */
     fq >>= 1;
     r--;
-  } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+  } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
     c++;
   } else { /* fractional part is > 0.5 */
     fq++;
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index d9bb7743..021bcb25 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -41,7 +41,7 @@ init_simd (void)
 {
   char *env = NULL;
 
-  if (simd_support != ~0)
+  if (simd_support != ~0U)
     return;
 
   simd_support = jpeg_simd_cpu_support();

From be6d424626a91cc909b06b57133721a3804cfdb7 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Tue, 7 Feb 2012 23:41:10 +0000
Subject: [PATCH 05/26] Compiler warning

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.1.x@785 632fc199-4ca6-4c93-a231-07263d6284db
---
 jdhuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jdhuff.c b/jdhuff.c
index b795462f..12db1242 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -784,7 +784,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     usefast = 0;
   }
 
-  if (cinfo->src->bytes_in_buffer < BUFSIZE * cinfo->blocks_in_MCU
+  if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU
     || cinfo->unread_marker != 0)
     usefast = 0;
 

From f832eae5c2f4837e59c9450ef1d8587884994ea7 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 8 Feb 2012 09:56:04 +0000
Subject: [PATCH 06/26] Merge description from README-turbo.txt into packages

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.1.x@788 632fc199-4ca6-4c93-a231-07263d6284db
---
 release/Description.plist.in  |  2 +-
 release/ReadMe.rtf            |  6 +++---
 release/deb-control.tmpl      | 18 ++++++++----------
 release/libjpeg-turbo.spec.in | 23 +++++++++++------------
 release/pkginfo.in            |  2 +-
 5 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/release/Description.plist.in b/release/Description.plist.in
index adca3ece..46a5bceb 100644
--- a/release/Description.plist.in
+++ b/release/Description.plist.in
@@ -3,7 +3,7 @@
 <plist version="1.0">
 <dict>
 	<key>IFPkgDescriptionDescription</key>
-	<string>A SIMD-accelerated JPEG codec which provides both the libjpeg and TurboJPEG APIs</string>
+	<string>A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs</string>
 	<key>IFPkgDescriptionTitle</key>
 	<string>@PACKAGE_NAME@</string>
 	<key>IFPkgDescriptionVersion</key>
diff --git a/release/ReadMe.rtf b/release/ReadMe.rtf
index 6d344c80..06c94abe 100644
--- a/release/ReadMe.rtf
+++ b/release/ReadMe.rtf
@@ -1,13 +1,13 @@
-{\rtf1\ansi\ansicpg1252\cocoartf1038\cocoasubrtf350
+{\rtf1\ansi\ansicpg1252\cocoartf1038\cocoasubrtf360
 {\fonttbl\f0\fswiss\fcharset0 Helvetica;}
 {\colortbl;\red255\green255\blue255;}
 \margl1440\margr1440\vieww15200\viewh9600\viewkind0
 \deftab720
 \pard\pardeftab720\ql\qnatural
 
-\f0\fs24 \cf0 libjpeg-turbo is a derivative of libjpeg which uses SIMD instructions (MMX, SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as the unmodified version of libjpeg, all else being equal.  libjpeg-turbo also includes a wrapper library which implements the TurboJPEG API used by VirtualGL and TurboVNC.\
+\f0\fs24 \cf0 libjpeg-turbo is a derivative of libjpeg that uses SIMD instructions (MMX, SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as the unmodified version of libjpeg, all else being equal.\
 \
-libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but the TigerVNC and VirtualGL projects made numerous enhancements to the codec in 2009, including improved support for Mac OS X, 64-bit support, support for 32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman encoding/decoding, and various bug fixes.  The goal was to produce a fully open source codec that could replace the partially closed source TurboJPEG/IPP codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.\
+libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but the TigerVNC and VirtualGL projects made numerous enhancements to the codec in 2009, including improved support for Mac OS X, 64-bit support, support for 32-bit and big-endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman encoding/decoding, and various bug fixes.  The goal was to produce a fully open-source codec that could replace the partially closed-source TurboJPEG/IPP codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally achieves 80-120% of the performance of TurboJPEG/IPP.  It is faster in some areas but slower in others.\
 \
 In early 2010, libjpeg-turbo spun off into its own independent project, with the goal of making high-speed JPEG compression/decompression technology available to a broader range of users and developers.\
 }
\ No newline at end of file
diff --git a/release/deb-control.tmpl b/release/deb-control.tmpl
index 5a7377ec..9cf7c46e 100644
--- a/release/deb-control.tmpl
+++ b/release/deb-control.tmpl
@@ -5,22 +5,20 @@ Priority: optional
 Architecture: {__ARCH}
 Essential: no
 Maintainer: The libjpeg-turbo Project [http://www.libjpeg-turbo.org]
-Description: A SIMD-accelerated JPEG codec which provides both the libjpeg and TurboJPEG APIs
- libjpeg-turbo is a derivative of libjpeg which uses SIMD instructions (MMX,
- SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86
- and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast
- as the unmodified version of libjpeg, all else being equal.  libjpeg-turbo
- also includes a wrapper library which implements the TurboJPEG API used by
- VirtualGL and TurboVNC.
+Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
+ libjpeg-turbo is a derivative of libjpeg that uses SIMD instructions (MMX,
+ SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86,
+ x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as
+ fast as the unmodified version of libjpeg, all else being equal.
  .
  libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
  the TigerVNC and VirtualGL projects made numerous enhancements to the codec
  in 2009, including improved support for Mac OS X, 64-bit support, support
- for 32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated
+ for 32-bit and big-endian pixel formats (RGBX, XBGR, etc.), accelerated
  Huffman encoding/decoding, and various bug fixes.  The goal was to produce a
- fully open source codec that could replace the partially closed source
+ fully open-source codec that could replace the partially closed-source
  TurboJPEG/IPP codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally
- performs in the range of 80-120% of TurboJPEG/IPP.  It is faster in some
+ achieves 80-120% of the performance of TurboJPEG/IPP.  It is faster in some
  areas but slower in others.
  .
  In early 2010, libjpeg-turbo spun off into its own independent project, with
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 645be70f..fdb3e980 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -4,7 +4,7 @@
 %define __lib lib
 %endif
 
-Summary: A SIMD-accelerated JPEG codec which provides both the libjpeg and TurboJPEG APIs
+Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
 Name: @PACKAGE_NAME@
 Version: @VERSION@
 Vendor: The libjpeg-turbo Project
@@ -19,21 +19,20 @@ Provides: %{name} = %{version}-%{release}, turbojpeg = 2.00
 Obsoletes: turbojpeg
 
 %description
-libjpeg-turbo is a derivative of libjpeg which uses SIMD instructions (MMX,
-SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86
-and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast
-as the unmodified version of libjpeg, all else being equal.  libjpeg-turbo also
-includes a wrapper library which implements the TurboJPEG API used by VirtualGL
-and TurboVNC.
+libjpeg-turbo is a derivative of libjpeg that uses SIMD instructions (MMX,
+SSE2, NEON) to accelerate baseline JPEG compression and decompression on x86,
+x86-64, and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as
+fast as the unmodified version of libjpeg, all else being equal.
 
 libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
 the TigerVNC and VirtualGL projects made numerous enhancements to the codec in
 2009, including improved support for Mac OS X, 64-bit support, support for
-32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
-encoding/decoding, and various bug fixes.  The goal was to produce a fully open
-source codec that could replace the partially closed source TurboJPEG/IPP codec
-used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range
-of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.
+32-bit and big-endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
+encoding/decoding, and various bug fixes.  The goal was to produce a fully
+open-source codec that could replace the partially closed-source TurboJPEG/IPP
+codec used by VirtualGL and TurboVNC.  libjpeg-turbo generally achieves 80-120%
+of the performance of TurboJPEG/IPP.  It is faster in some areas but slower in
+others.
 
 In early 2010, libjpeg-turbo spun off into its own independent project, with
 the goal of making high-speed JPEG compression/decompression technology
diff --git a/release/pkginfo.in b/release/pkginfo.in
index 7801dfb7..772ad0f6 100644
--- a/release/pkginfo.in
+++ b/release/pkginfo.in
@@ -3,7 +3,7 @@ PKG=@PACKAGE_NAME@
 NAME=@PACKAGE_NAME@ SDK and run time libraries
 VERSION=@VERSION@,REV=@BUILD@
 SUNW_PKGVERS=1.0
-DESC=A SIMD-accelerated JPEG codec which provides both the libjpeg and TurboJPEG APIs
+DESC=A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
 VENDOR=The libjpeg-turbo Project
 HOTLINE=http://www.libjpeg-turbo.org
 EMAIL=information@libjpeg-turbo.org

From a1647c84128ecd840024a8579cd672c4556d9ff7 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 00:39:05 +0000
Subject: [PATCH 07/26] Install docs when doing 'make install' on Unix; Fix
 'install' target on Windows; Include wizard.txt, example.c, and structure.txt
 in installed docs

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@790 632fc199-4ca6-4c93-a231-07263d6284db
---
 CMakeLists.txt                |  7 ++++---
 Makefile.am                   | 12 +++++++++---
 release/libjpeg-turbo.nsi.in  |  6 ++++++
 release/libjpeg-turbo.spec.in |  7 +++++--
 release/makecygwinpkg.in      | 10 +++-------
 release/makedpkg.in           | 10 ++++------
 release/makemacpkg.in         | 10 +++-------
 release/makesunpkg.in         | 13 ++++++++-----
 8 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a166730a..d95c9e68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -419,9 +419,10 @@ install(TARGETS jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom tjbench
   RUNTIME DESTINATION bin
 )
 
-install(FILES ${CMAKE_SOURCE_DIR}/LGPL.txt ${CMAKE_SOURCE_DIR}/LICENSE.txt
-  ${CMAKE_SOURCE_DIR}/README ${CMAKE_SOURCE_DIR}/README-turbo.txt
-  ${CMAKE_SOURCE_DIR}/libjpeg.txt ${CMAKE_SOURCE_DIR}/usage.txt
+install(FILES ${CMAKE_SOURCE_DIR}/README ${CMAKE_SOURCE_DIR}/README-turbo.txt
+  ${CMAKE_SOURCE_DIR}/example.c ${CMAKE_SOURCE_DIR}/libjpeg.txt
+  ${CMAKE_SOURCE_DIR}/structure.txt ${CMAKE_SOURCE_DIR}/usage.txt
+  ${CMAKE_SOURCE_DIR}/wizard.txt
   DESTINATION doc)
 
 install(FILES ${CMAKE_BINARY_DIR}/jconfig.h ${CMAKE_SOURCE_DIR}/jerror.h
diff --git a/Makefile.am b/Makefile.am
index e8b25702..791fdd2e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -120,9 +120,15 @@ jcstest_LDADD = libjpeg.la
 
 dist_man1_MANS = cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
 
-DOCS= README install.txt usage.txt wizard.txt example.c libjpeg.txt \
-	structure.txt coderules.txt filelist.txt jconfig.txt change.log \
-	README-turbo.txt rdrle.c wrrle.c BUILDING.txt ChangeLog.txt
+DOCS= install.txt coderules.txt filelist.txt jconfig.txt change.log \
+	rdrle.c wrrle.c BUILDING.txt ChangeLog.txt
+
+docdir = $(datadir)/doc
+doc_DATA = README README-turbo.txt libjpeg.txt structure.txt usage.txt \
+	wizard.txt
+
+exampledir = $(datadir)/doc
+example_DATA = example.c
 
 
 EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
diff --git a/release/libjpeg-turbo.nsi.in b/release/libjpeg-turbo.nsi.in
index 39773f6a..f4643fd1 100755
--- a/release/libjpeg-turbo.nsi.in
+++ b/release/libjpeg-turbo.nsi.in
@@ -76,8 +76,11 @@ Section "@CMAKE_PROJECT_NAME@ SDK for @INST_PLATFORM@ (required)"
 	SetOutPath $INSTDIR\doc
 	File "@CMAKE_SOURCE_DIR@\README"
 	File "@CMAKE_SOURCE_DIR@\README-turbo.txt"
+	File "@CMAKE_SOURCE_DIR@\example.c"
 	File "@CMAKE_SOURCE_DIR@\libjpeg.txt"
+	File "@CMAKE_SOURCE_DIR@\structure.txt"
 	File "@CMAKE_SOURCE_DIR@\usage.txt"
+	File "@CMAKE_SOURCE_DIR@\wizard.txt"
 
 	WriteRegStr HKLM "SOFTWARE\@INST_DIR@ @VERSION@" "Install_Dir" "$INSTDIR"
 
@@ -132,8 +135,11 @@ Section "Uninstall"
 	Delete $INSTDIR\uninstall_@VERSION@.exe
 	Delete $INSTDIR\doc\README
 	Delete $INSTDIR\doc\README-turbo.txt
+	Delete $INSTDIR\doc\example.c
 	Delete $INSTDIR\doc\libjpeg.txt
+	Delete $INSTDIR\doc\structure.txt
 	Delete $INSTDIR\doc\usage.txt
+	Delete $INSTDIR\doc\wizard.txt
 
 	RMDir "$INSTDIR\include"
 	RMDir "$INSTDIR\lib"
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
index 5d40fede..9e0ff883 100644
--- a/release/libjpeg-turbo.spec.in
+++ b/release/libjpeg-turbo.spec.in
@@ -48,7 +48,7 @@ available to a broader range of users and developers.
 %install
 
 rm -rf $RPM_BUILD_ROOT
-make install DESTDIR=$RPM_BUILD_ROOT libdir=/opt/%{name}/%{__lib} mandir=/opt/%{name}/man
+make install DESTDIR=$RPM_BUILD_ROOT libdir=/opt/%{name}/%{__lib} mandir=/opt/%{name}/man docdir=/opt/%{name}/doc exampledir=/opt/%{name}/doc
 rm -f $RPM_BUILD_ROOT/opt/%{name}/%{__lib}/*.la
 mkdir -p $RPM_BUILD_ROOT/usr/%{__lib}
 mv $RPM_BUILD_ROOT/opt/%{name}/%{__lib}/libturbojpeg.* $RPM_BUILD_ROOT/usr/%{__lib}
@@ -58,6 +58,8 @@ mkdir -p $RPM_BUILD_ROOT/usr/include
 mv $RPM_BUILD_ROOT/opt/%{name}/include/turbojpeg.h $RPM_BUILD_ROOT/usr/include
 ln -fs /usr/include/turbojpeg.h $RPM_BUILD_ROOT/opt/%{name}/include/
 ln -fs /usr/%{__lib}/libturbojpeg.a $RPM_BUILD_ROOT/opt/%{name}/%{__lib}/
+mkdir -p $RPM_BUILD_ROOT%{_defaultdocdir}
+mv $RPM_BUILD_ROOT/opt/%{name}/doc $RPM_BUILD_ROOT%{_defaultdocdir}/%{name}-%{version}
 
 %post -p /sbin/ldconfig
 
@@ -68,7 +70,8 @@ rm -rf $RPM_BUILD_ROOT
 
 %files
 %defattr(-,root,root)
-%doc %{_srcdir}/README-turbo.txt %{_srcdir}/README %{_srcdir}/libjpeg.txt %{_srcdir}/usage.txt
+%dir %{_defaultdocdir}/%{name}-%{version}
+%doc %{_defaultdocdir}/%{name}-%{version}/*
 %dir /opt/%{name}
 %dir /opt/%{name}/bin
 /opt/%{name}/bin/cjpeg
diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
index 2dbd650c..5fac864c 100755
--- a/release/makecygwinpkg.in
+++ b/release/makecygwinpkg.in
@@ -23,14 +23,10 @@ umask 022
 rm -f $PACKAGE_NAME-$VERSION-cygwin.tar.bz2
 TMPDIR=`mktemp -d /tmp/ljtbuild.XXXXXX`
 __PWD=`pwd`
-make install DESTDIR=$TMPDIR/pkg mandir=/opt/$PACKAGE_NAME/man
+make install DESTDIR=$TMPDIR/pkg mandir=/opt/$PACKAGE_NAME/man \
+	docdir=/usr/share/doc/$PACKAGE_NAME-$VERSION \
+	exampledir=/usr/share/doc/$PACKAGE_NAME-$VERSION
 rm $TMPDIR/pkg/opt/$PACKAGE_NAME/lib/*.la
-DOCDIR=$TMPDIR/pkg/usr/share/doc/$PACKAGE_NAME-$VERSION
-mkdir -p $DOCDIR
-install -m 644 $SRCDIR/README-turbo.txt $DOCDIR
-install -m 644 $SRCDIR/README $DOCDIR
-install -m 644 $SRCDIR/libjpeg.txt $DOCDIR
-install -m 644 $SRCDIR/usage.txt $DOCDIR
 ln -fs lib $TMPDIR/pkg/opt/$PACKAGE_NAME/lib32
 cd $TMPDIR/pkg
 tar cfj ../$PACKAGE_NAME-$VERSION-cygwin.tar.bz2 *
diff --git a/release/makedpkg.in b/release/makedpkg.in
index 7a15518d..936151b6 100644
--- a/release/makedpkg.in
+++ b/release/makedpkg.in
@@ -36,7 +36,10 @@ makedeb()
 		| sed s/{__VERSION}/$VERSION/g | sed s/{__BUILD}/$BUILD/g \
 		| sed s/{__ARCH}/$DEBARCH/g > $TMPDIR/DEBIAN/control)
 
-	make install prefix=$TMPDIR/opt/$DIRNAME libdir=$TMPDIR/opt/$DIRNAME/$__LIB mandir=$TMPDIR/opt/$DIRNAME/man
+	make install prefix=$TMPDIR/opt/$DIRNAME libdir=$TMPDIR/opt/$DIRNAME/$__LIB \
+		mandir=$TMPDIR/opt/$DIRNAME/man \
+		docdir=$TMPDIR/usr/share/doc/$DIRNAME-$VERSION \
+		exampledir=$TMPDIR/usr/share/doc/$DIRNAME-$VERSION
 	rm -f $TMPDIR/opt/$DIRNAME/$__LIB/*.la
 
 	if [ $SUPPLEMENT = 1 ]; then
@@ -63,11 +66,6 @@ makedeb()
 		mkdir -p $TMPDIR/usr/include
 		mv $TMPDIR/opt/$DIRNAME/include/turbojpeg.h $TMPDIR/usr/include
 		ln -fs /usr/include/turbojpeg.h $TMPDIR/opt/$DIRNAME/include/
-		mkdir -p $TMPDIR/usr/share/doc/$DIRNAME-$VERSION
-		install -m 644 $SRCDIR/README-turbo.txt $TMPDIR/usr/share/doc/$DIRNAME-$VERSION
-		install -m 644 $SRCDIR/README $TMPDIR/usr/share/doc/$DIRNAME-$VERSION
-		install -m 644 $SRCDIR/libjpeg.txt $TMPDIR/usr/share/doc/$DIRNAME-$VERSION
-		install -m 644 $SRCDIR/usage.txt $TMPDIR/usr/share/doc/$DIRNAME-$VERSION
 	fi
 
 	sudo chown -Rh root:root $TMPDIR/*
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 2ee08053..b0f45ab9 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -46,7 +46,9 @@ TMPDIR=`mktemp -d /tmp/$PACKAGE_NAME-build.XXXXXX`
 PKGROOT=$TMPDIR/pkg/Package_Root
 mkdir -p $PKGROOT
 mkdir -p $PKGROOT/opt/$PACKAGE_NAME/bin
-make install DESTDIR=$PKGROOT mandir=/opt/$PACKAGE_NAME/man
+make install DESTDIR=$PKGROOT mandir=/opt/$PACKAGE_NAME/man \
+	docdir=/Library/Documentation/$PACKAGE_NAME \
+	exampledir=/Library/Documentation/$PACKAGE_NAME
 rm -f $PKGROOT/opt/$PACKAGE_NAME/lib/*.la
 mkdir -p $PKGROOT/usr/lib
 mv $PKGROOT/opt/$PACKAGE_NAME/lib/libturbojpeg.* $PKGROOT/usr/lib
@@ -133,7 +135,6 @@ if [ ! -h $PKGROOT/opt/$PACKAGE_NAME/lib64 ]; then
 	ln -fs lib $PKGROOT/opt/$PACKAGE_NAME/lib64
 fi
 
-mkdir -p $PKGROOT/Library/Documentation/$PACKAGE_NAME
 chmod 1775 $PKGROOT/Library
 chmod 775 $PKGROOT/Library/Documentation 
 mkdir -p $TMPDIR/pkg/Resources 
@@ -142,11 +143,6 @@ cp pkgscripts/Description.plist $TMPDIR/pkg/
 cp pkgscripts/Info.plist $TMPDIR/pkg/
 install -m 755 pkgscripts/uninstall $PKGROOT/opt/$PACKAGE_NAME/bin/
 
-install -m 644 $SRCDIR/README-turbo.txt $PKGROOT/Library/Documentation/$PACKAGE_NAME/README-turbo.txt 
-install -m 644 $SRCDIR/README $PKGROOT/Library/Documentation/$PACKAGE_NAME/README 
-install -m 644 $SRCDIR/libjpeg.txt $PKGROOT/Library/Documentation/$PACKAGE_NAME/libjpeg.txt 
-install -m 644 $SRCDIR/usage.txt $PKGROOT/Library/Documentation/$PACKAGE_NAME/usage.txt 
-
 sudo chown -R root:admin $PKGROOT 
 sudo chown -R root:0 $PKGROOT/usr 
 cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadMe.rtf $TMPDIR/pkg/Resources/ 
diff --git a/release/makesunpkg.in b/release/makesunpkg.in
index 3331dab0..7cf36a88 100644
--- a/release/makesunpkg.in
+++ b/release/makesunpkg.in
@@ -65,11 +65,11 @@ if [ $COMBINED = 1 ]; then
 	cd $PWD
 fi
 # This mess is to work around a bug in /usr/ccs/bin/make
-make install DESTDIR=$TMPDIR libdir=/opt/$PACKAGE_NAME/$__LIB mandir=/opt/$PACKAGE_NAME/man AM_MAKEFLAGS="libdir=/opt/$PACKAGE_NAME/$__LIB mandir=/opt/$PACKAGE_NAME/man"
+make install DESTDIR=$TMPDIR libdir=/opt/$PACKAGE_NAME/$__LIB \
+	mandir=/opt/$PACKAGE_NAME/man docdir=/opt/$PACKAGE_NAME/doc \
+	exampledir=/opt/$PACKAGE_NAME/doc \
+	AM_MAKEFLAGS="libdir=/opt/$PACKAGE_NAME/$__LIB mandir=/opt/$PACKAGE_NAME/man docdir=/opt/$PACKAGE_NAME/doc exampledir=/opt/$PACKAGE_NAME/doc"
 rm -f $TMPDIR/opt/$PACKAGE_NAME/$__LIB/*.la
-mkdir -p $TMPDIR/opt/$PACKAGE_NAME/doc
-cp $SRCDIR/README-turbo.txt $SRCDIR/README $SRCDIR/libjpeg.txt $SRCDIR/usage.txt $TMPDIR/opt/$PACKAGE_NAME/doc
-chmod 644 $TMPDIR/opt/$PACKAGE_NAME/doc/*
 
 cat >$TMPDIR/proto <<EOF
 i copyright
@@ -122,10 +122,13 @@ f none $PACKAGE_NAME/include/jmorecfg.h 0644 root bin
 f none $PACKAGE_NAME/include/jpeglib.h 0644 root bin
 f none $PACKAGE_NAME/include/turbojpeg.h 0644 root bin
 d none $PACKAGE_NAME/doc 0755 root bin
-f none $PACKAGE_NAME/doc/libjpeg.txt 0644 root bin
 f none $PACKAGE_NAME/doc/README 0644 root bin
 f none $PACKAGE_NAME/doc/README-turbo.txt 0644 root bin
+f none $PACKAGE_NAME/doc/example.c 0644 root bin
+f none $PACKAGE_NAME/doc/libjpeg.txt 0644 root bin
+f none $PACKAGE_NAME/doc/structure.txt 0644 root bin
 f none $PACKAGE_NAME/doc/usage.txt 0644 root bin
+f none $PACKAGE_NAME/doc/wizard.txt 0644 root bin
 EOF
 if [ $WITH_JAVA = 1 ]; then
 echo d none $PACKAGE_NAME/classes 0755 root bin >>$TMPDIR/proto

From 20b734e1c4a84054c5cbdbd8f6ac0e75d39545e2 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 01:30:37 +0000
Subject: [PATCH 08/26] Infrastructure for producing a universal
 x86-64/i386/ARM version of libjpeg.a and libturbojpeg.a on OS X, so that the
 same library can be used to build OS X and iOS apps

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@793 632fc199-4ca6-4c93-a231-07263d6284db
---
 BUILDING.txt          | 33 +++++++++++++++-----
 ChangeLog.txt         |  5 +++
 Makefile.am           | 14 ++++++++-
 release/makemacpkg.in | 71 ++++++++++++++++++++++++++++++++++++-------
 4 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index 0b3e055f..fc583ee3 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -644,13 +644,32 @@ make dmg
 
 make udmg [BUILDDIR32={32-bit build directory}]
 
-  On 64-bit OS X systems, this creates a version of the Macintosh package and
-  disk image that contains universal i386/x86-64 binaries.  You should first
-  configure a 32-bit out-of-tree build of libjpeg-turbo, then configure a
-  64-bit out-of-tree build, then run 'make udmg' from the 64-bit build
-  directory.  The build system will look for the 32-bit build under
-  {source_directory}/osxx86 by default, but you can override this by setting
-  the BUILDDIR32 variable on the make command line as shown above.
+  On 64-bit OS X systems, this creates a Macintosh package and disk image that
+  contains universal i386/x86-64 binaries.  You should first configure a 32-bit
+  out-of-tree build of libjpeg-turbo, then configure a 64-bit out-of-tree
+  build, then run 'make udmg' from the 64-bit build directory.  The build
+  system will look for the 32-bit build under {source_directory}/osxx86 by
+  default, but you can override this by setting the BUILDDIR32 variable on the
+  make command line as shown above.
+
+make iosdmg [BUILDDIR32={32-bit build directory}] \
+  [BUILDDIRARMV6={ARM v6 build directory}] \
+  [BUILDDIRARMV7={ARM v7 build directory}]
+
+  On OS X systems, this creates a Macintosh package and disk image in which the
+  libjpeg-turbo static libraries contain ARM architectures necessary to build
+  iOS applications.  If building on an x86-64 system, the binaries will also
+  contain the i386 architecture, as with 'make udmg' above.  You should first
+  configure ARM v6 and ARM v7 out-of-tree builds of libjpeg-turbo (see
+  "Building libjpeg-turbo for iOS" above).  If you are building an x86-64
+  version of libjpeg-turbo, you should configure a 32-bit out-of-tree build as
+  well.  Next, build libjpeg-turbo as you would normally, using an out-of-tree
+  build.  When it is built, run 'make iosdmg' from the build directory.  The
+  build system will look for the ARM v6 build under {source_directory}/iosarmv6
+  by default, the ARM v7 build under {source_directory}/iosarmv7 by default,
+  and (if applicable) the 32-bit build under {source_directory}/osxx86 by
+  default, but you can override this by setting the BUILDDIR32, BUILDDIRARMV6,
+  and/or BUILDDIRARMV7 variables on the make command line as shown above.
 
 make sunpkg
 
diff --git a/ChangeLog.txt b/ChangeLog.txt
index f6b38d7b..529bd7e9 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -29,6 +29,11 @@ K component is assigned a component ID of 1 instead of 4.  Although these files
 are in violation of the spec, other JPEG implementations handle them
 correctly.
 
+[7] Added ARM v6 and ARM v7 architectures to libjpeg.a and libturbojpeg.a in
+the official OS X distribution package, so that those libraries can be used to
+build both OS X and iOS applications.
+
+
 
 1.1.90 (1.2 beta1)
 ==================
diff --git a/Makefile.am b/Makefile.am
index 791fdd2e..ffd5bb2f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -299,10 +299,22 @@ srpm: dist-gzip
 deb: all
 	sh pkgscripts/makedpkg
 
+BUILDDIR32=@abs_top_srcdir@/osxx86
+BUILDDIRARMV6=@abs_top_srcdir@/iosarmv6
+BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
+
 if X86_64
 
 udmg: all
-	sh pkgscripts/makemacpkg universal ${BUILDDIR32}
+	sh pkgscripts/makemacpkg -builddir32 ${BUILDDIR32}
+
+iosdmg: all
+	sh pkgscripts/makemacpkg -builddir32 ${BUILDDIR32} -builddirarmv6 ${BUILDDIRARMV6} -builddirarmv7 ${BUILDDIRARMV7}
+
+else
+
+iosdmg: all
+	sh pkgscripts/makemacpkg -builddirarmv6 ${BUILDDIRARMV6} -builddirarmv7 ${BUILDDIRARMV7}
 
 endif
 
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index b0f45ab9..89a3fd7d 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -17,24 +17,27 @@ onexit()
 
 usage()
 {
-	echo "$0 [universal [32-bit build dir]]"
+	echo "$0 [-builddir32 [32-bit build dir]] [-builddirarmv6 [ARM v6 build dir]] [-builddirarmv7 [ARM v7 build dir]]"
 	exit 1
 }
 
-UNIVERSAL=0
-
 PACKAGE_NAME=@PACKAGE_NAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
 SRCDIR=@abs_top_srcdir@
-BUILDDIR32=@abs_top_srcdir@/osxx86
+BUILDDIR32=
+BUILDDIRARMV6=
+BUILDDIRARMV7=
 WITH_JAVA=@WITH_JAVA@
-if [ $# -gt 0 ]; then
-	if [ "$1" = "universal" ]; then
-		UNIVERSAL=1
-		if [ $# -gt 1 ]; then BUILDDIR32=$2; fi
-	fi
-fi
+while [ $# -gt 0 ]; do
+	case $1 in
+	-h*)             usage 0                   ;;
+	-builddir32)     BUILDDIR32=$2;     shift  ;;
+	-builddirarmv6)  BUILDDIRARMV6=$2;  shift  ;;
+	-builddirarmv7)  BUILDDIRARMV7=$2;  shift  ;;
+	esac
+	shift
+done
 PACKAGEMAKER=/Developer/Applications/Utilities/PackageMaker.app/Contents/MacOS/PackageMaker
 
 if [ -f $PACKAGE_NAME-$VERSION.dmg ]; then
@@ -55,7 +58,7 @@ mv $PKGROOT/opt/$PACKAGE_NAME/lib/libturbojpeg.* $PKGROOT/usr/lib
 mkdir -p $PKGROOT/usr/include
 mv $PKGROOT/opt/$PACKAGE_NAME/include/turbojpeg.h $PKGROOT/usr/include
 
-if [ $UNIVERSAL = 1 ]; then
+if [ ! "$BUILDDIR32" = "" ]; then
 	if [ ! -d $BUILDDIR32 ]; then
 		echo ERROR: 32-bit build directory $BUILDDIR32 does not exist
 		exit 1
@@ -120,6 +123,52 @@ if [ $UNIVERSAL = 1 ]; then
 
 fi
 
+if [ ! "$BUILDDIRARMV6" = "" ]; then
+	if [ ! -d $BUILDDIRARMV6 ]; then
+		echo ERROR: ARM v6 build directory $BUILDDIRARMV6 does not exist
+		exit 1
+	fi
+	if [ ! -f $BUILDDIRARMV6/Makefile ]; then
+		echo ERROR: ARM v6 build directory $BUILDDIRARMV6 is not configured
+		exit 1
+	fi
+	mkdir -p $TMPDIR/dist.armv6
+	pushd $BUILDDIRARMV6
+	make install DESTDIR=$TMPDIR/dist.armv6
+	popd
+	lipo -create \
+		$PKGROOT/opt/$PACKAGE_NAME/lib/libjpeg.a \
+		-arch arm $TMPDIR/dist.armv6/opt/$PACKAGE_NAME/lib/libjpeg.a \
+		-output $PKGROOT/opt/$PACKAGE_NAME/lib/libjpeg.a
+	lipo -create \
+		$PKGROOT/usr/lib/libturbojpeg.a \
+		-arch arm $TMPDIR/dist.armv6/opt/$PACKAGE_NAME/lib/libturbojpeg.a \
+		-output $PKGROOT/usr/lib/libturbojpeg.a
+fi
+
+if [ ! "$BUILDDIRARMV7" = "" ]; then
+	if [ ! -d $BUILDDIRARMV7 ]; then
+		echo ERROR: ARM v7 build directory $BUILDDIRARMV7 does not exist
+		exit 1
+	fi
+	if [ ! -f $BUILDDIRARMV7/Makefile ]; then
+		echo ERROR: ARM v7 build directory $BUILDDIRARMV7 is not configured
+		exit 1
+	fi
+	mkdir -p $TMPDIR/dist.armv7
+	pushd $BUILDDIRARMV7
+	make install DESTDIR=$TMPDIR/dist.armv7
+	popd
+	lipo -create \
+		$PKGROOT/opt/$PACKAGE_NAME/lib/libjpeg.a \
+		-arch arm $TMPDIR/dist.armv7/opt/$PACKAGE_NAME/lib/libjpeg.a \
+		-output $PKGROOT/opt/$PACKAGE_NAME/lib/libjpeg.a
+	lipo -create \
+		$PKGROOT/usr/lib/libturbojpeg.a \
+		-arch arm $TMPDIR/dist.armv7/opt/$PACKAGE_NAME/lib/libturbojpeg.a \
+		-output $PKGROOT/usr/lib/libturbojpeg.a
+fi
+
 install_name_tool -id /opt/$PACKAGE_NAME/lib/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/opt/$PACKAGE_NAME/lib/libjpeg.@SO_MAJOR_VERSION@.dylib
 install_name_tool -id libturbojpeg.dylib $PKGROOT/usr/lib/libturbojpeg.dylib
 

From 57bd84f4355131ac9762f78b71f80264b94a4654 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 01:40:29 +0000
Subject: [PATCH 09/26] RPM spec no longer uses %{_srcdir}

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@796 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index ffd5bb2f..8d8bc3a2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -273,7 +273,7 @@ rpm: all
 	ln -fs `pwd` $$TMPDIR/BUILD; \
 	rm -f ${PACKAGE_NAME}-${VERSION}.${RPMARCH}.rpm; \
 	rpmbuild -bb --define "_blddir $$TMPDIR/buildroot"  \
-		--define "_topdir $$TMPDIR" --define "_srcdir ${srcdir}" \
+		--define "_topdir $$TMPDIR" \
 		--target ${RPMARCH} pkgscripts/libjpeg-turbo.spec; \
 	cp $$TMPDIR/RPMS/${RPMARCH}/${PACKAGE_NAME}-${VERSION}-${BUILD}.${RPMARCH}.rpm \
 		${PACKAGE_NAME}-${VERSION}.${RPMARCH}.rpm; \
@@ -289,7 +289,7 @@ srpm: dist-gzip
 	rm -f ${PACKAGE_NAME}-${VERSION}.src.rpm; \
 	cp ${PACKAGE_NAME}-${VERSION}.tar.gz $$TMPDIR/SOURCES; \
 	cat pkgscripts/libjpeg-turbo.spec | sed s/%{_blddir}/%{_tmppath}/g \
-		| sed s@%{_srcdir}/@@g | sed s/#--\>//g \
+		| sed s/#--\>//g \
 		> $$TMPDIR/SPECS/libjpeg-turbo.spec; \
 	rpmbuild -bs --define "_topdir $$TMPDIR" $$TMPDIR/SPECS/libjpeg-turbo.spec; \
 	cp $$TMPDIR/SRPMS/${PACKAGE_NAME}-${VERSION}-${BUILD}.src.rpm \

From ef49f790bdebd25ff84d2909b757967352f54f98 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 01:44:23 +0000
Subject: [PATCH 10/26] git-svn-id:
 svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@797
 632fc199-4ca6-4c93-a231-07263d6284db

---
 ChangeLog.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 529bd7e9..7cd5b59b 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -34,7 +34,6 @@ the official OS X distribution package, so that those libraries can be used to
 build both OS X and iOS applications.
 
 
-
 1.1.90 (1.2 beta1)
 ==================
 

From 01bf9d9ccd5afd18ceffa7dcefe543d282742343 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 01:52:31 +0000
Subject: [PATCH 11/26] Fix 'make dist'

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@799 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 8d8bc3a2..b21acc24 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,11 +124,11 @@ DOCS= install.txt coderules.txt filelist.txt jconfig.txt change.log \
 	rdrle.c wrrle.c BUILDING.txt ChangeLog.txt
 
 docdir = $(datadir)/doc
-doc_DATA = README README-turbo.txt libjpeg.txt structure.txt usage.txt \
+dist_doc_DATA = README README-turbo.txt libjpeg.txt structure.txt usage.txt \
 	wizard.txt 
 
 exampledir = $(datadir)/doc
-example_DATA = example.c
+dist_example_DATA = example.c
 
 
 EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \

From 51d626ff9d32dc67294575460c5401e763ae8ca9 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 02:51:40 +0000
Subject: [PATCH 12/26] Don't include documentation in 32-bit supplemental
 package

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@801 632fc199-4ca6-4c93-a231-07263d6284db
---
 release/makedpkg.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/release/makedpkg.in b/release/makedpkg.in
index 936151b6..6d081bb6 100644
--- a/release/makedpkg.in
+++ b/release/makedpkg.in
@@ -47,6 +47,7 @@ makedeb()
 		rm -rf $TMPDIR/opt/$DIRNAME/man
 		rm -rf $TMPDIR/opt/$DIRNAME/bin
 		rm -rf $TMPDIR/opt/$DIRNAME/classes
+		rm -rf $TMPDIR/usr
 	fi
 
 	mkdir -p $TMPDIR/usr/$__LIB

From 9e7eb2768638bd655e17a13e90b98c84aa927443 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 03:41:20 +0000
Subject: [PATCH 13/26] Move build dir. variables back into makemacpkg to
 avoid messing up the Solaris packaging system.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@804 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am           |  4 ----
 release/makemacpkg.in | 30 ++++++++++++++++++++++++------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index b21acc24..0fef9fa5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -299,10 +299,6 @@ srpm: dist-gzip
 deb: all
 	sh pkgscripts/makedpkg
 
-BUILDDIR32=@abs_top_srcdir@/osxx86
-BUILDDIRARMV6=@abs_top_srcdir@/iosarmv6
-BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
-
 if X86_64
 
 udmg: all
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 89a3fd7d..6cb4069d 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -25,16 +25,34 @@ PACKAGE_NAME=@PACKAGE_NAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
 SRCDIR=@abs_top_srcdir@
-BUILDDIR32=
-BUILDDIRARMV6=
-BUILDDIRARMV7=
+BUILDDIR32=@abs_top_srcdir@/osxx86
+BUILDDIRARMV6=@abs_top_srcdir@/iosarmv6
+BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
 WITH_JAVA=@WITH_JAVA@
 while [ $# -gt 0 ]; do
 	case $1 in
 	-h*)             usage 0                   ;;
-	-builddir32)     BUILDDIR32=$2;     shift  ;;
-	-builddirarmv6)  BUILDDIRARMV6=$2;  shift  ;;
-	-builddirarmv7)  BUILDDIRARMV7=$2;  shift  ;;
+	-builddir32)
+		if [ $# -gt 1 ]; then  # a value may follow; otherwise the default set above is kept
+			if [[ ! "$2" =~ ^- ]]; then  # only a leading '-' marks the next option; paths may contain '-'
+				BUILDDIR32=$2;  shift
+			fi
+		fi
+		;;
+	-builddirarmv6)
+		if [ $# -gt 1 ]; then  # a value may follow; otherwise the default set above is kept
+			if [[ ! "$2" =~ ^- ]]; then  # only a leading '-' marks the next option; paths may contain '-'
+				BUILDDIRARMV6=$2;  shift
+			fi
+		fi
+		;;
+	-builddirarmv7)
+		if [ $# -gt 1 ]; then  # a value may follow; otherwise the default set above is kept
+			if [[ ! "$2" =~ ^- ]]; then  # only a leading '-' marks the next option; paths may contain '-'
+				BUILDDIRARMV7=$2;  shift
+			fi
+		fi
+		;;
 	esac
 	shift
 done

From 69c1408e9d5ea1db45ae537b911c60088d813ad5 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 10 Feb 2012 03:47:18 +0000
Subject: [PATCH 14/26] Oops

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@805 632fc199-4ca6-4c93-a231-07263d6284db
---
 release/makecygwinpkg.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
index 5fac864c..32cecddb 100755
--- a/release/makecygwinpkg.in
+++ b/release/makecygwinpkg.in
@@ -25,7 +25,7 @@ TMPDIR=`mktemp -d /tmp/ljtbuild.XXXXXX`
 __PWD=`pwd`
 make install DESTDIR=$TMPDIR/pkg mandir=/opt/$PACKAGE_NAME/man \
 	docdir=/usr/share/doc/$PACKAGE_NAME-$VERSION \
-	exampledir=/usr/share/doc/$PACKAGE_NAME-$VERSION \
+	exampledir=/usr/share/doc/$PACKAGE_NAME-$VERSION
 rm $TMPDIR/pkg/opt/$PACKAGE_NAME/lib/*.la
 ln -fs lib $TMPDIR/pkg/opt/$PACKAGE_NAME/lib32
 cd $TMPDIR/pkg

From a9b646c202a2a154a125545b4347cbfd70598c9e Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sun, 11 Mar 2012 22:06:54 +0000
Subject: [PATCH 15/26] Allow RGB JPEG files to be created/decoded when using
 the LJT colorspace extensions

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@809 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt |  8 +++++
 jccolext.c    | 34 +++++++++++++++++++-
 jccolor.c     | 88 +++++++++++++++++++++++++++++++++++++++++++--------
 jdcolext.c    | 37 ++++++++++++++++++++++
 jdcolor.c     | 70 +++++++++++++++++++++++++++++++++++++---
 5 files changed, 219 insertions(+), 18 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 7cd5b59b..e27ac09a 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,11 @@
+1.2.1
+=====
+
+[1] Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+
 1.2.0
 =====
 
diff --git a/jccolext.c b/jccolext.c
index acbfa235..dbac84a9 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -2,7 +2,7 @@
  * jccolext.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2012, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -112,3 +112,35 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
     }
   }
 }
+
+
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * This version handles extended RGB->plain RGB conversion
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb_convert_internal (j_compress_ptr cinfo,
+                          JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                          JDIMENSION output_row, int num_rows)
+{
+  register JSAMPROW inptr;
+  register JSAMPROW outptr0, outptr1, outptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->image_width;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+    for (col = 0; col < num_cols; col++) {
+      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
+      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
+      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      inptr += RGB_PIXELSIZE;
+    }
+  }
+}
diff --git a/jccolor.c b/jccolor.c
index 97305557..3a0772bb 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2012, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -96,6 +96,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 #define rgb_ycc_convert_internal extrgb_ycc_convert_internal
 #define rgb_gray_convert_internal extrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -103,6 +104,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_RGBX_RED
 #define RGB_GREEN EXT_RGBX_GREEN
@@ -110,6 +112,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 #define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
 #define rgb_gray_convert_internal extrgbx_gray_convert_internal
+#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -117,6 +120,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_BGR_RED
 #define RGB_GREEN EXT_BGR_GREEN
@@ -124,6 +128,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 #define rgb_ycc_convert_internal extbgr_ycc_convert_internal
 #define rgb_gray_convert_internal extbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -131,6 +136,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_BGRX_RED
 #define RGB_GREEN EXT_BGRX_GREEN
@@ -138,6 +144,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 #define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
 #define rgb_gray_convert_internal extbgrx_gray_convert_internal
+#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -145,6 +152,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_XBGR_RED
 #define RGB_GREEN EXT_XBGR_GREEN
@@ -152,6 +160,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 #define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
 #define rgb_gray_convert_internal extxbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -159,6 +168,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_XRGB_RED
 #define RGB_GREEN EXT_XRGB_GREEN
@@ -166,6 +176,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 #define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
 #define rgb_gray_convert_internal extxrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -173,6 +184,7 @@ typedef my_color_converter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef rgb_ycc_convert_internal
 #undef rgb_gray_convert_internal
+#undef rgb_rgb_convert_internal
 
 
 /*
@@ -306,6 +318,52 @@ rgb_gray_convert (j_compress_ptr cinfo,
 }
 
 
+/*
+ * Extended RGB to plain RGB conversion
+ */
+
+METHODDEF(void)
+rgb_rgb_convert (j_compress_ptr cinfo,
+		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+		  JDIMENSION output_row, int num_rows)
+{
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                   num_rows);
+      break;
+    case JCS_EXT_BGR:
+      extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                   num_rows);
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                   num_rows);
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                   num_rows);
+      break;
+    default:
+      rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                               num_rows);
+      break;
+  }
+}
+
+
 /*
  * Convert some rows of samples to the JPEG colorspace.
  * This version handles Adobe-style CMYK->YCCK conversion,
@@ -523,21 +581,25 @@ jinit_color_converter (j_compress_ptr cinfo)
     break;
 
   case JCS_RGB:
-  case JCS_EXT_RGB:
-  case JCS_EXT_RGBX:
-  case JCS_EXT_BGR:
-  case JCS_EXT_BGRX:
-  case JCS_EXT_XBGR:
-  case JCS_EXT_XRGB:
-  case JCS_EXT_RGBA:
-  case JCS_EXT_BGRA:
-  case JCS_EXT_ABGR:
-  case JCS_EXT_ARGB:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == cinfo->jpeg_color_space &&
-      rgb_pixelsize[cinfo->in_color_space] == 3)
+    if (rgb_red[cinfo->in_color_space] == 0 &&
+        rgb_green[cinfo->in_color_space] == 1 &&
+        rgb_blue[cinfo->in_color_space] == 2 &&
+        rgb_pixelsize[cinfo->in_color_space] == 3)
       cconvert->pub.color_convert = null_convert;
+    else if (cinfo->in_color_space == JCS_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGBX ||
+             cinfo->in_color_space == JCS_EXT_BGR ||
+             cinfo->in_color_space == JCS_EXT_BGRX ||
+             cinfo->in_color_space == JCS_EXT_XBGR ||
+             cinfo->in_color_space == JCS_EXT_XRGB ||
+             cinfo->in_color_space == JCS_EXT_RGBA ||
+             cinfo->in_color_space == JCS_EXT_BGRA ||
+             cinfo->in_color_space == JCS_EXT_ABGR ||
+             cinfo->in_color_space == JCS_EXT_ARGB)
+      cconvert->pub.color_convert = rgb_rgb_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
diff --git a/jdcolext.c b/jdcolext.c
index 07da949f..3b8aeffc 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -102,3 +102,40 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
     }
   }
 }
+
+
+/*
+ * Convert RGB to extended RGB: just swap the order of source pixels
+ */
+
+INLINE
+LOCAL(void)
+rgb_rgb_convert_internal (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JSAMPROW outptr;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr[RGB_RED] = inptr0[col];
+      outptr[RGB_GREEN] = inptr1[col];
+      outptr[RGB_BLUE] = inptr2[col];
+      /* Set unused byte to 0xFF so it can be interpreted as an opaque */
+      /* alpha channel value */
+#ifdef RGB_ALPHA
+      outptr[RGB_ALPHA] = 0xFF;
+#endif
+      outptr += RGB_PIXELSIZE;
+    }
+  }
+}
diff --git a/jdcolor.c b/jdcolor.c
index d9268dbb..694de9b6 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011, D. R. Commander.
+ * Copyright (C) 2009, 2011-2012, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -80,6 +80,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extrgb_convert_internal
 #define gray_rgb_convert_internal gray_extrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -87,6 +88,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_RGBX_RED
 #define RGB_GREEN EXT_RGBX_GREEN
@@ -95,6 +97,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
 #define gray_rgb_convert_internal gray_extrgbx_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -103,6 +106,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_BGR_RED
 #define RGB_GREEN EXT_BGR_GREEN
@@ -110,6 +114,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extbgr_convert_internal
 #define gray_rgb_convert_internal gray_extbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -117,6 +122,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_BGRX_RED
 #define RGB_GREEN EXT_BGRX_GREEN
@@ -125,6 +131,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
 #define gray_rgb_convert_internal gray_extbgrx_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -133,6 +140,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_XBGR_RED
 #define RGB_GREEN EXT_XBGR_GREEN
@@ -141,6 +149,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
 #define gray_rgb_convert_internal gray_extxbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -149,6 +158,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 #define RGB_RED EXT_XRGB_RED
 #define RGB_GREEN EXT_XRGB_GREEN
@@ -157,6 +167,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 #define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
 #define gray_rgb_convert_internal gray_extxrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -165,6 +176,7 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #undef RGB_PIXELSIZE
 #undef ycc_rgb_convert_internal
 #undef gray_rgb_convert_internal
+#undef rgb_rgb_convert_internal
 
 
 /*
@@ -352,6 +364,51 @@ gray_rgb_convert (j_decompress_ptr cinfo,
 }
 
 
+/*
+ * Convert plain RGB to extended RGB
+ */
+
+METHODDEF(void)
+rgb_rgb_convert (j_decompress_ptr cinfo,
+		  JSAMPIMAGE input_buf, JDIMENSION input_row,
+		  JSAMPARRAY output_buf, int num_rows)
+{
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                   num_rows);
+      break;
+    case JCS_EXT_BGR:
+      rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                   num_rows);
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                   num_rows);
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                   num_rows);
+      break;
+    default:
+      rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                               num_rows);
+      break;
+  }
+}
+
 /*
  * Adobe-style YCCK->CMYK conversion.
  * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same
@@ -494,9 +551,14 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
       }
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
-      rgb_pixelsize[cinfo->out_color_space] == 3) {
-      cconvert->pub.color_convert = null_convert;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      if (rgb_red[cinfo->out_color_space] == 0 &&
+          rgb_green[cinfo->out_color_space] == 1 &&
+          rgb_blue[cinfo->out_color_space] == 2 &&
+          rgb_pixelsize[cinfo->out_color_space] == 3)
+        cconvert->pub.color_convert = null_convert;
+      else
+        cconvert->pub.color_convert = rgb_rgb_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;

From cac105133e75a52fa5d57a2abccf242bb7b820d0 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 16 Mar 2012 14:37:36 +0000
Subject: [PATCH 16/26] Fix the behavior of the alpha-enabled colorspace
 constants whenever libjpeg-turbo is built without SIMD support and merged
 upsampling is used.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@811 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt        |  6 ++++++
 java/TJUnitTest.java | 31 ++++++++++++++++---------------
 jdmerge.c            |  8 ++++++++
 jdmrgext.c           | 28 ++++++++++++++++++++++++++++
 tjbench.c            |  2 +-
 tjunittest.c         | 21 ++++++++++-----------
 6 files changed, 69 insertions(+), 27 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index e27ac09a..b0d06a76 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -5,6 +5,12 @@
 properly work when the input or output colorspace is one of the libjpeg-turbo
 colorspace extensions.
 
+[2] When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
 
 1.2.0
 =====
diff --git a/java/TJUnitTest.java b/java/TJUnitTest.java
index 557a85a9..b88b28ed 100644
--- a/java/TJUnitTest.java
+++ b/java/TJUnitTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2012 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -765,6 +765,9 @@ public class TJUnitTest {
       for(int pf : formats) {
         for(int i = 0; i < 2; i++) {
           int flags = 0;
+          if (subsamp == TJ.SAMP_422 || subsamp == TJ.SAMP_420
+            || subsamp == TJ.SAMP_440)
+            flags |= TJ.FLAG_FASTUPSAMPLE;
           if(i == 1) {
             if(yuv == YUVDECODE) {
               tjc.close();  tjd.close();  return;
@@ -850,20 +853,18 @@ public class TJUnitTest {
       if(doyuv) yuv = YUVENCODE;
       doTest(35, 39, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_444, testName);
       doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_444, testName);
-      if(doyuv) {
-        doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_422,
-          testName);
-        doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_422,
-          testName);
-        doTest(39, 41, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_420,
-          testName);
-        doTest(41, 35, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_420,
-          testName);
-        doTest(35, 39, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_440,
-          testName);
-        doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_440,
-          testName);
-      }
+      doTest(41, 35, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_422,
+        testName);
+      doTest(35, 39, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_422,
+        testName);
+      doTest(39, 41, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_420,
+        testName);
+      doTest(41, 35, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_420,
+        testName);
+      doTest(35, 39, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_440,
+        testName);
+      doTest(39, 41, bi ? _4byteFormatsBI : _4byteFormats, TJ.SAMP_440,
+        testName);
       doTest(35, 39, bi ? onlyGrayBI : onlyGray, TJ.SAMP_GRAY, testName);
       doTest(39, 41, bi ? _3byteFormatsBI : _3byteFormats, TJ.SAMP_GRAY,
         testName);
diff --git a/jdmerge.c b/jdmerge.c
index cfa3bb92..53361252 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -103,6 +103,7 @@ typedef my_upsampler * my_upsample_ptr;
 #define RGB_RED EXT_RGBX_RED
 #define RGB_GREEN EXT_RGBX_GREEN
 #define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 #define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
 #define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
@@ -110,6 +111,7 @@ typedef my_upsampler * my_upsample_ptr;
 #undef RGB_RED
 #undef RGB_GREEN
 #undef RGB_BLUE
+#undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
@@ -131,6 +133,7 @@ typedef my_upsampler * my_upsample_ptr;
 #define RGB_RED EXT_BGRX_RED
 #define RGB_GREEN EXT_BGRX_GREEN
 #define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 #define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
 #define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
@@ -138,6 +141,7 @@ typedef my_upsampler * my_upsample_ptr;
 #undef RGB_RED
 #undef RGB_GREEN
 #undef RGB_BLUE
+#undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
@@ -145,6 +149,7 @@ typedef my_upsampler * my_upsample_ptr;
 #define RGB_RED EXT_XBGR_RED
 #define RGB_GREEN EXT_XBGR_GREEN
 #define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 #define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
 #define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
@@ -152,6 +157,7 @@ typedef my_upsampler * my_upsample_ptr;
 #undef RGB_RED
 #undef RGB_GREEN
 #undef RGB_BLUE
+#undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
@@ -159,6 +165,7 @@ typedef my_upsampler * my_upsample_ptr;
 #define RGB_RED EXT_XRGB_RED
 #define RGB_GREEN EXT_XRGB_GREEN
 #define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 #define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
 #define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
@@ -166,6 +173,7 @@ typedef my_upsampler * my_upsample_ptr;
 #undef RGB_RED
 #undef RGB_GREEN
 #undef RGB_BLUE
+#undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
diff --git a/jdmrgext.c b/jdmrgext.c
index 95ddd556..2b932655 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -2,6 +2,7 @@
  * jdmrgext.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 2011, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -54,11 +55,17 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr[RGB_ALPHA] = 0xFF;
+#endif
     outptr += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr0++);
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr[RGB_ALPHA] = 0xFF;
+#endif
     outptr += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -72,6 +79,9 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr[RGB_ALPHA] = 0xFF;
+#endif
   }
 }
 
@@ -120,21 +130,33 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr0[RGB_ALPHA] = 0xFF;
+#endif
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr00++);
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr0[RGB_ALPHA] = 0xFF;
+#endif
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr1[RGB_ALPHA] = 0xFF;
+#endif
     outptr1 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr1[RGB_ALPHA] = 0xFF;
+#endif
     outptr1 += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -148,9 +170,15 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr0[RGB_ALPHA] = 0xFF;
+#endif
     y  = GETJSAMPLE(*inptr01);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#ifdef RGB_ALPHA
+    outptr1[RGB_ALPHA] = 0xFF;
+#endif
   }
 }
diff --git a/tjbench.c b/tjbench.c
index f298732c..5f5b9e94 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -681,7 +681,7 @@ void usage(char *progname)
 	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
 	printf("     Test the specified color conversion path in the codec (default: BGR)\n");
 	printf("-fastupsample = Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
-	printf("     YUV decoding in libjpeg decompressor\n");
+	printf("     YUV decoding\n");
 	printf("-quiet = Output results in tabular rather than verbose format\n");
 	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
 	printf("-yuvdecode = Decode JPEG image to planar YUV rather than RGB\n");
diff --git a/tjunittest.c b/tjunittest.c
index d14ec528..89a6d1db 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2012 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -77,7 +77,7 @@ const int _onlyGray[]={TJPF_GRAY};
 const int _onlyRGB[]={TJPF_RGB};
 
 enum {YUVENCODE=1, YUVDECODE};
-int yuv=0, alloc=0, alpha=0;
+int yuv=0, alloc=0;
 
 int exitStatus=0;
 #define bailout() {exitStatus=-1;  goto bailout;}
@@ -502,6 +502,8 @@ void doTest(int w, int h, const int *formats, int nformats, int subsamp,
 		for(i=0; i<2; i++)
 		{
 			int flags=0;
+			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440)
+				flags|=TJFLAG_FASTUPSAMPLE;
 			if(i==1)
 			{
 				if(yuv==YUVDECODE) goto bailout;
@@ -617,15 +619,12 @@ int main(int argc, char *argv[])
 	if(doyuv) {yuv=YUVENCODE;  alloc=0;}
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
 	doTest(39, 41, _4byteFormats, 4, TJSAMP_444, "test");
-	if(doyuv)
-	{
-		doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
-		doTest(35, 39, _4byteFormats, 4, TJSAMP_422, "test");
-		doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
-		doTest(41, 35, _4byteFormats, 4, TJSAMP_420, "test");
-		doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
-		doTest(39, 41, _4byteFormats, 4, TJSAMP_440, "test");
-	}
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
+	doTest(35, 39, _4byteFormats, 4, TJSAMP_422, "test");
+	doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
+	doTest(41, 35, _4byteFormats, 4, TJSAMP_420, "test");
+	doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
+	doTest(39, 41, _4byteFormats, 4, TJSAMP_440, "test");
 	doTest(35, 39, _onlyGray, 1, TJSAMP_GRAY, "test");
 	doTest(39, 41, _3byteFormats, 2, TJSAMP_GRAY, "test");
 	doTest(41, 35, _4byteFormats, 4, TJSAMP_GRAY, "test");

From 8015a303086599719b89c1d3a2c3c56405451ef0 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Sat, 17 Mar 2012 14:32:38 +0000
Subject: [PATCH 17/26] Visual Studio 2010 doesn't like the wildcard at
 compile time, so let CMake expand it instead.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@813 632fc199-4ca6-4c93-a231-07263d6284db
---
 simd/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 0eacebe1..06bbae4f 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -37,6 +37,8 @@ else()
   set(OBJDIR ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 
+file(GLOB INC_FILES *.inc)
+
 foreach(file ${SIMD_BASENAMES})
   set(DEPFILE "")
   set(SIMD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/${file}.asm)
@@ -53,7 +55,8 @@ foreach(file ${SIMD_BASENAMES})
     string(REGEX REPLACE "gra" "gry" DEPFILE ${DEPFILE})
   endif()
   set(SIMD_OBJ ${OBJDIR}/${file}.obj)
-  add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${SIMD_SRC} ${DEPFILE} *.inc
+  add_custom_command(OUTPUT ${SIMD_OBJ}
+    DEPENDS ${SIMD_SRC} ${DEPFILE} ${INC_FILES}
     COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ})
   set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
 endforeach()

From 68071bd2e427cf1f68c40dc771b23a0046ddb4e1 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Thu, 22 Mar 2012 22:05:09 +0000
Subject: [PATCH 18/26] 1.2.1

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@814 632fc199-4ca6-4c93-a231-07263d6284db
---
 CMakeLists.txt | 2 +-
 configure.ac   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d95c9e68..e7cbbf95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 cmake_minimum_required(VERSION 2.6)
 
 project(libjpeg-turbo C)
-set(VERSION 1.2.0)
+set(VERSION 1.2.1)
 
 if(MINGW OR CYGWIN)
   execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
diff --git a/configure.ac b/configure.ac
index 2930c27c..59f50ff5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.2.0])
+AC_INIT([libjpeg-turbo], [1.2.1])
 BUILD=`date +%Y%m%d`
 
 AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])

From 7c6ae034da54989dbe3e6254492e68efa5ec9179 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 23 Mar 2012 00:51:56 +0000
Subject: [PATCH 19/26] Fix universal DMG build

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@816 632fc199-4ca6-4c93-a231-07263d6284db
---
 Makefile.am           |  6 +++---
 release/makemacpkg.in | 20 +++++++++++++-------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 0fef9fa5..aa2426be 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -302,15 +302,15 @@ deb: all
 if X86_64
 
 udmg: all
-	sh pkgscripts/makemacpkg -builddir32 ${BUILDDIR32}
+	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
 
 iosdmg: all
-	sh pkgscripts/makemacpkg -builddir32 ${BUILDDIR32} -builddirarmv6 ${BUILDDIRARMV6} -builddirarmv7 ${BUILDDIRARMV7}
+	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7}
 
 else
 
 iosdmg: all
-	sh pkgscripts/makemacpkg -builddirarmv6 ${BUILDDIRARMV6} -builddirarmv7 ${BUILDDIRARMV7}
+	sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7}
 
 endif
 
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 6cb4069d..7b43aa37 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -17,7 +17,7 @@ onexit()
 
 usage()
 {
-	echo "$0 [-builddir32 [32-bit build dir]] [-builddirarmv6 [ARM v6 build dir]] [-builddirarmv7 [ARM v7 build dir]]"
+	echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARM v6 build dir]] [-buildarmv7 [ARM v7 build dir]]"
 	exit 1
 }
 
@@ -26,27 +26,33 @@ VERSION=@VERSION@
 BUILD=@BUILD@
 SRCDIR=@abs_top_srcdir@
 BUILDDIR32=@abs_top_srcdir@/osxx86
+BUILD32=0
 BUILDDIRARMV6=@abs_top_srcdir@/iosarmv6
+BUILDARMV6=0
 BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
+BUILDARMV7=0
 WITH_JAVA=@WITH_JAVA@
 while [ $# -gt 0 ]; do
 	case $1 in
 	-h*)             usage 0                   ;;
-	-builddir32)
+	-build32)
+		BUILD32=1
 		if [ $# -gt 1 ]; then
 			if [[ ! "$2" =~ -.* ]]; then
 				BUILDDIR32=$2;  shift
 			fi
 		fi
 		;;
-	-builddirarmv6)
+	-buildarmv6)
+		BUILDARMV6=1
 		if [ $# -gt 1 ]; then
 			if [[ ! "$2" =~ -.* ]]; then
 				BUILDDIRARMV6=$2;  shift
 			fi
 		fi
 		;;
-	-builddirarmv7)
+	-buildarmv7)
+		BUILDARMV7=1
 		if [ $# -gt 1 ]; then
 			if [[ ! "$2" =~ -.* ]]; then
 				BUILDDIRARMV7=$2;  shift
@@ -76,7 +82,7 @@ mv $PKGROOT/opt/$PACKAGE_NAME/lib/libturbojpeg.* $PKGROOT/usr/lib
 mkdir -p $PKGROOT/usr/include
 mv $PKGROOT/opt/$PACKAGE_NAME/include/turbojpeg.h $PKGROOT/usr/include
 
-if [ ! "$BUILDDIR32" = "" ]; then
+if [ $BUILD32 = 1 ]; then
 	if [ ! -d $BUILDDIR32 ]; then
 		echo ERROR: 32-bit build directory $BUILDDIR32 does not exist
 		exit 1
@@ -141,7 +147,7 @@ if [ ! "$BUILDDIR32" = "" ]; then
 
 fi
 
-if [ ! "$BUILDDIRARMV6" = "" ]; then
+if [ $BUILDARMV6 = 1 ]; then
 	if [ ! -d $BUILDDIRARMV6 ]; then
 		echo ERROR: ARM v6 build directory $BUILDDIRARMV6 does not exist
 		exit 1
@@ -164,7 +170,7 @@ if [ ! "$BUILDDIRARMV6" = "" ]; then
 		-output $PKGROOT/usr/lib/libturbojpeg.a
 fi
 
-if [ ! "$BUILDDIRARMV7" = "" ]; then
+if [ $BUILDARMV7 = 1 ]; then
 	if [ ! -d $BUILDDIRARMV7 ]; then
 		echo ERROR: ARM v7 build directory $BUILDDIRARMV7 does not exist
 		exit 1

From 0559e94c725648b25bb07fdcde89513225de14ab Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 23 Mar 2012 03:12:35 +0000
Subject: [PATCH 20/26] "Sun Studio"="Oracle Solaris Studio"

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@818 632fc199-4ca6-4c93-a231-07263d6284db
---
 BUILDING.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/BUILDING.txt b/BUILDING.txt
index fc583ee3..8f19edc1 100644
--- a/BUILDING.txt
+++ b/BUILDING.txt
@@ -258,8 +258,8 @@ to the configure command line.  NASM 2.07 or later from FreeBSD ports must be
 installed.
 
 
-Sun Studio
-----------
+Oracle Solaris Studio
+---------------------
 
 Add
 
@@ -268,7 +268,7 @@ Add
 to the configure command line.  libjpeg-turbo will automatically be built with
 the maximum optimization level (-xO5) unless you override CFLAGS.
 
-To build a 64-bit version of libjpeg-turbo using Sun Studio, add
+To build a 64-bit version of libjpeg-turbo using Oracle Solaris Studio, add
 
   --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
 

From 2eda8212e4b01c9b4d343dd0eaa579f0bba036e7 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 23 Mar 2012 19:32:38 +0000
Subject: [PATCH 21/26] Ensure that tjDecompress2() exits cleanly if
 setDecompDefaults() fails

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@821 632fc199-4ca6-4c93-a231-07263d6284db
---
 turbojpeg.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/turbojpeg.c b/turbojpeg.c
index 2d5959a3..37ad5a37 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2012 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -756,7 +756,10 @@ DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle, unsigned char *jpegBuf,
 
 	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 	jpeg_read_header(dinfo, TRUE);
-	if(setDecompDefaults(dinfo, pixelFormat)==-1) return -1;
+	if(setDecompDefaults(dinfo, pixelFormat)==-1)
+	{
+		retval=-1;  goto bailout;
+	}
 
 	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
 

From d4c15e103cc91dcb6e8ca5291e490a9c138026b2 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Fri, 23 Mar 2012 19:39:14 +0000
Subject: [PATCH 22/26] Whitespace tweak

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@823 632fc199-4ca6-4c93-a231-07263d6284db
---
 turbojpeg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/turbojpeg.c b/turbojpeg.c
index 37ad5a37..e08c767f 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -498,7 +498,7 @@ DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
 	JSAMPROW *outbuf[MAX_COMPONENTS];
 	int row, pw, ph, cw[MAX_COMPONENTS], ch[MAX_COMPONENTS];
 	JSAMPLE *ptr=dstBuf;
-  unsigned long yuvsize=0;
+	unsigned long yuvsize=0;
 	jpeg_component_info *compptr;
 
 	getinstance(handle);

From 4f24016bde11aa4282900ff071a6c8481027e063 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Thu, 26 Apr 2012 19:50:37 +0000
Subject: [PATCH 23/26] Preserve all 128 bits of xmm6 and xmm7

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@829 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt     | 4 ++++
 simd/jsimdext.inc | 8 ++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index b0d06a76..8a8a29b5 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -11,6 +11,10 @@ decompression, the unused byte of the decompressed pixels was not being set to
 0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
 correct behavior of the colorspace extensions when merged upsampling is used.
 
+[3] Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
 
 1.2.0
 =====
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index 4ab9bc0f..1d4d3e2d 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -322,15 +322,15 @@ const_base:
 	push rsi
 	push rdi
 	sub     rsp, SIZEOF_XMMWORD
-	movlpd  XMMWORD [rsp], xmm6
+	movaps  XMMWORD [rsp], xmm6
 	sub     rsp, SIZEOF_XMMWORD
-	movlpd  XMMWORD [rsp], xmm7
+	movaps  XMMWORD [rsp], xmm7
 %endmacro
 
 %imacro uncollect_args 0
-	movlpd  xmm7, XMMWORD [rsp]
+	movaps  xmm7, XMMWORD [rsp]
 	add     rsp, SIZEOF_XMMWORD
-	movlpd  xmm6, XMMWORD [rsp]
+	movaps  xmm6, XMMWORD [rsp]
 	add     rsp, SIZEOF_XMMWORD
 	pop rdi
 	pop rsi

From dd2b651243125701dca2ed2f31b3d34056719b9c Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 30 May 2012 20:36:42 +0000
Subject: [PATCH 24/26] Guard against num_components being a ridiculous value
 due to a corrupt header

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@831 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt | 4 ++++
 jdmarker.c    | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 8a8a29b5..3775d543 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -15,6 +15,10 @@ correct behavior of the colorspace extensions when merged upsampling is used.
 upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
 calling conventions.
 
+[4] Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
+images (specifically, images in which the component count was erroneously set
+to a large value) would cause libjpeg-turbo to segfault.
+
 
 1.2.0
 =====
diff --git a/jdmarker.c b/jdmarker.c
index d8dcba98..6fc0f7dc 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -323,14 +323,15 @@ get_sos (j_decompress_ptr cinfo)
 
   /* Collect the component-spec parameters */
 
-  for (i = 0; i < cinfo->num_components; i++)
+  for (i = 0; i < MAX_COMPS_IN_SCAN; i++)
     cinfo->cur_comp_info[i] = NULL;
 
   for (i = 0; i < n; i++) {
     INPUT_BYTE(cinfo, cc, return FALSE);
     INPUT_BYTE(cinfo, c, return FALSE);
     
-    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+    for (ci = 0, compptr = cinfo->comp_info;
+	 ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
 	 ci++, compptr++) {
       if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
 	goto id_found;

From 69799275be895de4963bb22c975081cb53a147a0 Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 13 Jun 2012 01:21:29 +0000
Subject: [PATCH 25/26] Eliminate the use of the MASKMOVDQU instruction, to
 speed up decompression performance by 10x on AMD Bobcat embedded processors
 (and ~5% on AMD desktop processors.)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt        |  7 +++++++
 simd/jdclrss2-64.asm | 49 +++++++++++++++++--------------------------
 simd/jdclrss2.asm    | 49 +++++++++++++++++--------------------------
 simd/jdmrgss2-64.asm | 50 +++++++++++++++++---------------------------
 simd/jdmrgss2.asm    | 49 +++++++++++++++++--------------------------
 5 files changed, 83 insertions(+), 121 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 3775d543..e80ac6c3 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -19,6 +19,13 @@ calling conventions.
 images (specifically, images in which the component count was erroneously set
 to a large value) would cause libjpeg-turbo to segfault.
 
+[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors.  The MASKMOVDQU instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular.  Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
 
 1.2.0
 =====
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
index 696a383b..06cb213c 100644
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	rcx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	rcx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
@@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	por	xmmA,xmmG
 	por	xmmE,xmmC
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	rcx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
@@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	por	xmmA,xmmB
 	por	xmmE,xmmG
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
index 7f519e6f..1354c3dc 100644
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm
@@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	ecx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	ecx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
@@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	por	xmmA,xmmG
 	por	xmmE,xmmC
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	ecx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
@@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 	por	xmmA,xmmB
 	por	xmmE,xmmG
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index a64a6b33..ffe02882 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	jmp	near .columnloop
 
 .column_st32:
-	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
 	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	rcx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	rcx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
@@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	por	xmmA,xmmG
 	por	xmmE,xmmC
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [rdi],xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	rcx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
@@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	por	xmmA,xmmB
 	por	xmmE,xmmG
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [rdi],xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index 04089aa3..556a4906 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	ecx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	ecx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
@@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	por	xmmA,xmmG
 	por	xmmE,xmmC
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
@@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	ecx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
@@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 	por	xmmA,xmmB
 	por	xmmE,xmmG
 .adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi], xmmA
 %endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------

From 316617faf4a9c5f00bf76f4a0e9c9864d65ec97f Mon Sep 17 00:00:00 2001
From: DRC <dcommander@users.sourceforge.net>
Date: Wed, 13 Jun 2012 05:17:03 +0000
Subject: [PATCH 26/26] Accelerated 4:2:2 upsampling routine for ARM (improves
 performance ~20-30% when decompressing 4:2:2 JPEGs using fancy upsampling)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@837 632fc199-4ca6-4c93-a231-07263d6284db
---
 ChangeLog.txt         |   4 +
 simd/jsimd.h          |   4 +
 simd/jsimd_arm.c      |  12 +++
 simd/jsimd_arm_neon.S | 238 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 258 insertions(+)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index e80ac6c3..7e4b4ea1 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -26,6 +26,10 @@ it is painfully slow on Bobcat processors in particular.  Eliminating the use
 of this instruction improved performance by an order of magnitude on Bobcat
 processors and by a small amount (typically 5%) on AMD desktop processors.
 
+[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
 
 1.2.0
 =====
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 6ee99cc6..3d4751ff 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index af0c2c8a..cae84df0 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                            JSAMPARRAY input_data,
                            JSAMPARRAY * output_data_ptr)
 {
+  if (simd_support & JSIMD_ARM_NEON)
+    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
 }
 
 GLOBAL(int)
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index b2f9c2ae..9962b8a1 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 .endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
+ *                                 JDIMENSION   downsampled_width,
+ *                                 JSAMPARRAY   input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ *       this code; eliminating it could potentially yield a performance
+ *       improvement of tens of percent on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16   OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!  /* load the next 16 source pixels, advance INPTR */
+    vmovl.u8        q8,  d0  /* widen current pixels 0-7 to 16 bits */
+    vext.8          q2,  q1,  q0, #15  /* q2 = last byte of q1 followed by first 15 bytes of q0 */
+    vmovl.u8        q9,  d1  /* widen current pixels 8-15 to 16 bits */
+    vaddw.u8        q10, q15, d4  /* q10 = shifted pixels 0-7 + 1 (bias) */
+    vaddw.u8        q11, q15, d5  /* q11 = shifted pixels 8-15 + 1 (bias) */
+    vmlal.u8        q8,  d4,  d28  /* q8  += 3 * shifted pixels 0-7 */
+    vmlal.u8        q9,  d5,  d28  /* q9  += 3 * shifted pixels 8-15 */
+    vmlal.u8        q10, d0,  d28  /* q10 += 3 * current pixels 0-7 */
+    vmlal.u8        q11, d1,  d28  /* q11 += 3 * current pixels 8-15 */
+    vmov            q1,  q0       /* backup source pixels to q1 */
+    vrshrn.u16      d6,  q8,  #2  /* rounding shift: (3 * shifted + current + 2) >> 2 */
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2  /* plain shift: (3 * current + shifted + 1) >> 2 */
+    vshrn.u16       d9,  q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* interleave d6:d7 with d8:d9, store 32 bytes, advance OUTPTR */
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
+ * macro, the roles of q0 and q1 registers are reversed for even and odd
+ * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
+ * Also this unrolling allows reordering loads and stores to compensate for
+ * multiplication latency and reduce stalls.
+ */
+.macro upsample32   OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!  /* current pixels -> q0; previous group is in q1 */
+    vmovl.u8        q8,  d0  /* widen current pixels 0-7 to 16 bits */
+    vext.8          q2,  q1,  q0, #15  /* q2 = last byte of q1 followed by first 15 bytes of q0 */
+    vmovl.u8        q9,  d1  /* widen current pixels 8-15 to 16 bits */
+    vaddw.u8        q10, q15, d4  /* shifted pixels + 1 (bias) */
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28  /* += 3 * shifted pixels */
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28  /* += 3 * current pixels */
+    vmlal.u8        q11, d1,  d28
+        /* odd 16 pixels group */
+        vld1.8          {q1}, [\INPTR]!  /* current pixels -> q1; previous group is in q0 */
+    vrshrn.u16      d6,  q8,  #2  /* even group: rounding narrow, (sum + 2) >> 2 */
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2  /* even group: plain narrow, bias already added */
+    vshrn.u16       d9,  q11, #2
+        vmovl.u8        q8,  d2  /* q8-q11 are reused now that the even results sit in d6-d9 */
+        vext.8          q2,  q0,  q1, #15  /* q2 = last byte of q0 followed by first 15 bytes of q1 */
+        vmovl.u8        q9,  d3
+        vaddw.u8        q10, q15, d4
+        vaddw.u8        q11, q15, d5
+        vmlal.u8        q8,  d4,  d28
+        vmlal.u8        q9,  d5,  d28
+        vmlal.u8        q10, d2,  d28
+        vmlal.u8        q11, d3,  d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* store even group (32 bytes) before d6-d9 are overwritten */
+        vrshrn.u16      d6,  q8,  #2
+        vrshrn.u16      d7,  q9,  #2
+        vshrn.u16       d8,  q10, #2
+        vshrn.u16       d9,  q11, #2
+        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* store odd group (32 bytes) */
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1  /* WIDTH = number of source pixels after the first */
+    add             \OUTPTR, \OUTPTR, #1  /* OUTPTR -> second output pixel */
+    ldrb            \TMP1, [\INPTR, \WIDTH]  /* last source pixel */
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]  /* copied unchanged to the last output pixel */
+    ldrb            \TMP1, [\INPTR], #1  /* first source pixel; INPTR -> second source pixel */
+    strb            \TMP1, [\OUTPTR, #-1]  /* copied unchanged to the first output pixel */
+    vmov.8          d3[7], \TMP1  /* seed the top lane of q1 so the first vext sees it as the previous pixel */
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f  /* fewer than 32 pixels left */
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16  /* WIDTH = (pixels left) - 16 */
+    blt             1f  /* fewer than 16 pixels left */
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16  /* WIDTH = pixels still to do (0-15) */
+    beq             9f  /* row finished */
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH  /* INPTR -> one past the last pixel; the loads below walk backwards */
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6  /* rotate d0: lane 0 moves up to lane 2 */
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0  /* swap 32-bit halves: lanes 0-3 move up to lanes 4-7 */
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1,  d0  /* pixels gathered so far become the high half of q0 */
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8,  d0  /* same arithmetic as upsample16; results go to d10-d13 */
+    vext.8          q2,  q1,  q0, #15
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28
+    vmlal.u8        q11, d1,  d28
+    vrshrn.u16      d10, q8,  #2
+    vrshrn.u16      d12, q9,  #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11  /* interleave: q5 = output bytes for source lanes 0-7 */
+    vzip.8          d12, d13  /* interleave: q6 = output bytes for source lanes 8-15 */
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!  /* 16 output bytes */
+    vmov            q5,  q6  /* bring the next 16 output bytes into q5 */
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!  /* 8 output bytes */
+    vmov            d10,  d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!  /* 4 output bytes, stored one at a time */
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4  /* rotate d10: lanes 4-7 move down to lanes 0-3 */
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!  /* final 2 output bytes */
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0  /* 1st argument; counted down as the row loop counter */
+    DOWNSAMPLED_WIDTH .req r1  /* 2nd argument; copied into WIDTH for each row */
+    INPUT_DATA        .req r2  /* 3rd argument; walks the array of input row pointers */
+    OUTPUT_DATA_PTR   .req r3  /* 4th argument; pointer to the output row-pointer array */
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR  /* same register (r3), reused once the pointer is dereferenced */
+
+    OUTPTR            .req r4
+    INPTR             .req r5
+    WIDTH             .req ip
+    TMP               .req lr
+
+    push            {r4, r5, r6, lr}  /* r4/r5/lr back OUTPTR/INPTR/TMP; r6 is not referenced here - presumably pushed for stack alignment (TODO confirm) */
+    vpush           {d8-d15}  /* the upsample macros write d8-d13 */
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]  /* r3 = *output_data_ptr */
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f  /* no rows to process */
+
+    /* initialize constants */
+    vmov.u8         d28, #3  /* multiplier 3 in every byte lane */
+    vmov.u16        q15, #1  /* +1 bias in every 16-bit lane */
+11:
+    ldr             INPTR, [INPUT_DATA], #4  /* next input row pointer; advance by 4 bytes */
+    ldr             OUTPTR, [OUTPUT_DATA], #4  /* next output row pointer; advance by 4 bytes */
+    mov             WIDTH, DOWNSAMPLED_WIDTH  /* upsample_row modifies WIDTH, so give it a fresh copy per row */
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b  /* loop while rows remain */
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}  /* restore registers; return by popping the saved lr into pc */
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row