From 1216e9530fe541df6b95ffa39d00df9919cee4fa Mon Sep 17 00:00:00 2001
From: Michael R Sweet <michael.r.sweet@gmail.com>
Date: Tue, 15 Feb 2011 15:25:53 +0000
Subject: Update JPEG library to v8b.

git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@8425 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
---
 jpeg/CMakeLists.txt |   12 +-
 jpeg/Makefile       |   59 +-
 jpeg/README         |  281 ++-
 jpeg/change.log     |   98 +
 jpeg/coderules.doc  |  118 --
 jpeg/coderules.txt  |  118 ++
 jpeg/filelist.doc   |  210 ---
 jpeg/filelist.txt   |  215 +++
 jpeg/install.txt    | 1096 ++++++++++++
 jpeg/jaricom.c      |  153 ++
 jpeg/jcapimin.c     |   10 +-
 jpeg/jcarith.c      |  934 ++++++++++
 jpeg/jccoefct.c     |   24 +-
 jpeg/jcdctmgr.c     |  489 +++--
 jpeg/jchuff.c       | 1013 +++++++++--
 jpeg/jchuff.h       |   47 -
 jpeg/jcinit.c       |   15 +-
 jpeg/jcmainct.c     |  102 +-
 jpeg/jcmarker.c     |   98 +-
 jpeg/jcmaster.c     |  320 +++-
 jpeg/jconfig.doc    |  155 --
 jpeg/jconfig.txt    |  164 ++
 jpeg/jcparam.c      |   72 +-
 jpeg/jcphuff.c      |  833 ---------
 jpeg/jcprepct.c     |   14 +-
 jpeg/jcsample.c     |   94 +-
 jpeg/jctrans.c      |   24 +-
 jpeg/jdapimin.c     |    5 +-
 jpeg/jdapistd.c     |    2 +-
 jpeg/jdarith.c      |  772 ++++++++
 jpeg/jdatadst.c     |  122 +-
 jpeg/jdatasrc.c     |   80 +-
 jpeg/jdcoefct.c     |   14 +-
 jpeg/jdct.h         |  239 ++-
 jpeg/jddctmgr.c     |  135 +-
 jpeg/jdhuff.c       | 1168 ++++++++++--
 jpeg/jdhuff.h       |  201 ---
 jpeg/jdinput.c      |  378 +++-
 jpeg/jdmainct.c     |  160 +-
 jpeg/jdmarker.c     |   74 +-
 jpeg/jdmaster.c     |  104 +-
 jpeg/jdphuff.c      |  668 -------
 jpeg/jdsample.c     |  147 +-
 jpeg/jdtrans.c      |   19 +-
 jpeg/jerror.c       |    1 -
 jpeg/jerror.h       |   19 +-
 jpeg/jfdctflt.c     |   48 +-
 jpeg/jfdctfst.c     |   48 +-
 jpeg/jfdctint.c     | 4269 ++++++++++++++++++++++++++++++++++++++++++--
 jpeg/jidctflt.c     |   57 +-
 jpeg/jidctint.c     | 4954 +++++++++++++++++++++++++++++++++++++++++++++++++--
 jpeg/jidctred.c     |  398 -----
 jpeg/jmorecfg.h     |   73 +-
 jpeg/jpegint.h      |   43 +-
 jpeg/jpeglib.h      |  102 +-
 jpeg/jutils.c       |   52 +
 jpeg/jversion.h     |    6 +-
 jpeg/libjpeg.doc    | 3006 -------------------------------
 jpeg/libjpeg.txt    | 3070 +++++++++++++++++++++++++++++++
 jpeg/makedepend     |   19 +-
 jpeg/structure.doc  |  948 ----------
 jpeg/structure.txt  |  945 ++++++++++
 jpeg/usage.doc      |  562 ------
 jpeg/usage.txt      |  617 +++++++
 jpeg/wizard.doc     |  211 ---
 jpeg/wizard.txt     |  211 +++
 66 files changed, 21699 insertions(+), 9016 deletions(-)
 delete mode 100644 jpeg/coderules.doc
 create mode 100644 jpeg/coderules.txt
 delete mode 100644 jpeg/filelist.doc
 create mode 100644 jpeg/filelist.txt
 create mode 100644 jpeg/install.txt
 create mode 100644 jpeg/jaricom.c
 create mode 100644 jpeg/jcarith.c
 delete mode 100644 jpeg/jchuff.h
 delete mode 100644 jpeg/jconfig.doc
 create mode 100644 jpeg/jconfig.txt
 delete mode 100644 jpeg/jcphuff.c
 create mode 100644 jpeg/jdarith.c
 delete mode 100644 jpeg/jdhuff.h
 delete mode 100644 jpeg/jdphuff.c
 delete mode 100644 jpeg/jidctred.c
 delete mode 100644 jpeg/libjpeg.doc
 create mode 100644 jpeg/libjpeg.txt
 delete mode 100644 jpeg/structure.doc
 create mode 100644 jpeg/structure.txt
 delete mode 100644 jpeg/usage.doc
 create mode 100644 jpeg/usage.txt
 delete mode 100644 jpeg/wizard.doc
 create mode 100644 jpeg/wizard.txt

(limited to 'jpeg')

diff --git a/jpeg/CMakeLists.txt b/jpeg/CMakeLists.txt
index 2b41aee7c..ef1431784 100644
--- a/jpeg/CMakeLists.txt
+++ b/jpeg/CMakeLists.txt
@@ -12,22 +12,22 @@ set(systemdependent_SRCS jmemnobs.c)
 
 # library object files common to compression and decompression
 set(common_SRCS
-   jcomapi.c jutils.c jerror.c jmemmgr.c
+   jaricom.c jcomapi.c jutils.c jerror.c jmemmgr.c
 )
 
 # compression library object files
 set(compression_SRCS
-   jcapimin.c jcapistd.c jctrans.c jcparam.c jdatadst.c jcinit.c
+   jcapimin.c jcapistd.c jcarith.c jctrans.c jcparam.c jdatadst.c jcinit.c
    jcmaster.c jcmarker.c jcmainct.c jcprepct.c jccoefct.c jccolor.c
-   jcsample.c jchuff.c jcphuff.c jcdctmgr.c jfdctfst.c jfdctflt.c
+   jcsample.c jchuff.c jcdctmgr.c jfdctfst.c jfdctflt.c
    jfdctint.c
 )
 
 # decompression library object files
 set(decompression_SRCS
-   jdapimin.c jdapistd.c jdtrans.c jdatasrc.c jdmaster.c
-   jdinput.c jdmarker.c jdhuff.c jdphuff.c jdmainct.c jdcoefct.c
-   jdpostct.c jddctmgr.c jidctfst.c jidctflt.c jidctint.c jidctred.c
+   jdapimin.c jdapistd.c jdarith.c jdtrans.c jdatasrc.c jdmaster.c
+   jdinput.c jdmarker.c jdhuff.c jdmainct.c jdcoefct.c
+   jdpostct.c jddctmgr.c jidctfst.c jidctflt.c jidctint.c
    jdsample.c jdcolor.c jquant1.c jquant2.c jdmerge.c
 )
 
diff --git a/jpeg/Makefile b/jpeg/Makefile
index 8c7fa57bc..3f62676ed 100644
--- a/jpeg/Makefile
+++ b/jpeg/Makefile
@@ -3,7 +3,7 @@
 #
 # JPEG library makefile for the Fast Light Toolkit (FLTK).
 #
-# Copyright 1997-2009 by Easy Software Products.
+# Copyright 1997-2011 by Bill Spitzak and others.
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Library General Public
@@ -32,16 +32,53 @@ include ../makeinclude
 # Object files...
 #
 
-OBJS	=	jmemnobs.o \
-		jcapimin.o jcapistd.o jccoefct.o jccolor.o jcdctmgr.o \
-		jchuff.o jcinit.o jcmainct.o jcmarker.o jcmaster.o jcomapi.o \
-		jcparam.o jcphuff.o jcprepct.o jcsample.o jctrans.o \
-		jdapimin.o jdapistd.o jdatadst.o jdatasrc.o jdcoefct.o \
-		jdcolor.o jddctmgr.o jdhuff.o jdinput.o jdmainct.o jdmarker.o \
-		jdmaster.o jdmerge.o jdphuff.o jdpostct.o jdsample.o \
-		jdtrans.o jerror.o jfdctflt.o jfdctfst.o jfdctint.o \
-		jidctflt.o jidctfst.o jidctint.o jidctred.o jquant1.o \
-		jquant2.o jutils.o jmemmgr.o
+OBJS	=	\
+		jaricom.o \
+		jcapimin.o \
+		jcapistd.o \
+		jcarith.o \
+		jccoefct.o \
+		jccolor.o \
+		jcdctmgr.o \
+		jchuff.o \
+		jcinit.o \
+		jcmainct.o \
+		jcmarker.o \
+		jcmaster.o \
+		jcomapi.o \
+		jcparam.o \
+		jcprepct.o \
+		jcsample.o \
+		jctrans.o \
+		jdapimin.o \
+		jdapistd.o \
+		jdarith.o \
+		jdatadst.o \
+		jdatasrc.o \
+		jdcoefct.o \
+		jdcolor.o \
+		jddctmgr.o \
+		jdhuff.o \
+		jdinput.o \
+		jdmainct.o \
+		jdmarker.o \
+		jdmaster.o \
+		jdmerge.o \
+		jdpostct.o \
+		jdsample.o \
+		jdtrans.o \
+		jerror.o \
+		jfdctflt.o \
+		jfdctfst.o \
+		jfdctint.o \
+		jidctflt.o \
+		jidctfst.o \
+		jidctint.o \
+		jmemmgr.o \
+		jmemnobs.o \
+		jquant1.o \
+		jquant2.o \
+		jutils.o
 
 LIBJPEG	=	../lib/libfltk_jpeg$(LIBEXT)
 
diff --git a/jpeg/README b/jpeg/README
index 86cc20669..e923a3200 100644
--- a/jpeg/README
+++ b/jpeg/README
@@ -1,22 +1,17 @@
 The Independent JPEG Group's JPEG software
 ==========================================
 
-README for release 6b of 27-Mar-1998
+README for release 8b of 16-May-2010
 ====================================
 
-This distribution contains the sixth public release of the Independent JPEG
+This distribution contains the eighth public release of the Independent JPEG
 Group's free JPEG software.  You are welcome to redistribute this software and
 to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.
 
-Serious users of this software (particularly those incorporating it into
-larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
-our electronic mailing list.  Mailing list members are notified of updates
-and have a chance to participate in technical discussions, etc.
-
-This software is the work of Tom Lane, Philip Gladstone, Jim Boucher,
-Lee Crocker, Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi,
-Guido Vollbeding, Ge' Weijers, and other members of the Independent JPEG
-Group.
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.
 
 IJG is not affiliated with the official ISO JPEG standards committee.
 
@@ -30,27 +25,27 @@ OVERVIEW            General description of JPEG and the IJG software.
 LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
 REFERENCES          Where to learn more about JPEG.
 ARCHIVE LOCATIONS   Where to find newer versions of this software.
-RELATED SOFTWARE    Other stuff you should get.
+ACKNOWLEDGMENTS     Special thanks.
 FILE FORMAT WARS    Software *not* to get.
 TO DO               Plans for future IJG releases.
 
 Other documentation files in the distribution are:
 
 User documentation:
-  install.doc       How to configure and install the IJG software.
-  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+  install.txt       How to configure and install the IJG software.
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
-  *.1               Unix-style man pages for programs (same info as usage.doc).
-  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
   change.log        Version-to-version change highlights.
 Programmer and internal documentation:
-  libjpeg.doc       How to use the JPEG library in your own programs.
+  libjpeg.txt       How to use the JPEG library in your own programs.
   example.c         Sample code for calling the JPEG library.
-  structure.doc     Overview of the JPEG library's internal structure.
-  filelist.doc      Road map of IJG files.
-  coderules.doc     Coding style rules --- please read if you contribute code.
+  structure.txt     Overview of the JPEG library's internal structure.
+  filelist.txt      Road map of IJG files.
+  coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.doc and usage.doc.  Useful information
+Please read at least the files install.txt and usage.txt.  Some information
 can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
 ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
 
@@ -62,24 +57,15 @@ the order listed) before diving into the code.
 OVERVIEW
 ========
 
-This package contains C software to implement JPEG image compression and
-decompression.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG is intended for compressing
-"real-world" scenes; line drawings, cartoons and other non-realistic images
-are not its strong suit.  JPEG is lossy, meaning that the output image is not
-exactly identical to the input image.  Hence you must not use JPEG if you
-have to have identical output bits.  However, on typical photographic images,
-very good compression levels can be obtained with no visible change, and
-remarkably high compression levels are possible if you can tolerate a
-low-quality image.  For more details, see the references, or just experiment
-with various compression settings.
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.
 
 This software implements JPEG baseline, extended-sequential, and progressive
 compression processes.  Provision is made for supporting all variants of these
 processes, although some uncommon parameter settings aren't implemented yet.
-For legal reasons, we are not distributing code for the arithmetic-coding
-variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
-the hierarchical or lossless processes defined in the standard.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.
 
 We provide a set of library routines for reading and writing JPEG image files,
 plus two sample applications "cjpeg" and "djpeg", which use the library to
@@ -91,10 +77,11 @@ considerable functionality beyond the bare JPEG coding/decoding capability;
 for example, the color quantization modules are not strictly part of JPEG
 decoding, but they are essential for output to colormapped file formats or
 colormapped displays.  These extra functions can be compiled out of the
-library if not required for a particular application.  We have also included
-"jpegtran", a utility for lossless transcoding between different JPEG
-processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
-inserting and extracting textual comments in JFIF files.
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.
 
 The emphasis in designing this software has been on achieving portability and
 flexibility, while also making it fast enough to be useful.  In particular,
@@ -127,7 +114,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -170,17 +157,8 @@ the foregoing paragraphs do.
 The Unix configuration script "configure" was produced with GNU Autoconf.
 It is copyright by the Free Software Foundation but is freely distributable.
 The same holds for its supporting scripts (config.guess, config.sub,
-ltconfig, ltmain.sh).  Another support script, install-sh, is copyright
-by M.I.T. but is also freely distributable.
-
-It appears that the arithmetic coding option of the JPEG spec is covered by
-patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
-legally be used without obtaining one or more licenses.  For this reason,
-support for arithmetic coding has been removed from the free JPEG software.
-(Since arithmetic coding provides only a marginal gain over the unpatented
-Huffman mode, it is unlikely that very many implementations will support it.)
-So far as we are aware, there are no patent restrictions on the remaining
-code.
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.
 
 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent, GIF reading support has
@@ -198,7 +176,7 @@ We are required to state that
 REFERENCES
 ==========
 
-We highly recommend reading one or more of these references before trying to
+We recommend reading one or more of these references before trying to
 understand the innards of the JPEG software.
 
 The best short technical introduction to the JPEG compression algorithm is
@@ -207,7 +185,7 @@ The best short technical introduction to the JPEG compression algorithm is
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PostScript file containing a revised version of Wallace's article is
-available at ftp://ftp.uu.net/graphics/jpeg/wallace.ps.gz.  The file (actually
+available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -222,82 +200,65 @@ code but don't know much about data compression in general.  The book's JPEG
 sample code is far from industrial-strength, but when you are ready to look
 at a full implementation, you've got one here...
 
-The best full description of JPEG is the textbook "JPEG Still Image Data
-Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
-by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
-The book includes the complete text of the ISO JPEG standards (DIS 10918-1
-and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
-in existence, and we highly recommend it.
-
-The JPEG standard itself is not available electronically; you must order a
-paper copy through ISO or ITU.  (Unless you feel a need to own a certified
-official copy, we recommend buying the Pennebaker and Mitchell book instead;
-it's much cheaper and includes a great deal of useful explanatory material.)
-In the USA, copies of the standard may be ordered from ANSI Sales at (212)
-642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
-doesn't take credit card orders, but Global does.)  It's not cheap: as of
-1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
-shipping/handling.  The standard is divided into two parts, Part 1 being the
-actual specification, while Part 2 covers compliance testing methods.  Part 1
-is titled "Digital Compression and Coding of Continuous-tone Still Images,
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+Although this is by far the most detailed and comprehensive exposition of
+JPEG publicly available, we point out that it is still missing an explanation
+of the most essential properties and algorithms of the underlying DCT
+technology.
+If you think that you know about DCT-based JPEG after reading this book,
+then you are deluded.  The real fundamentals and corresponding potential
+of DCT-based JPEG are not publicly known so far, and that is the reason for
+all the mistaken developments taking place in the image coding domain.
+
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
 Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
 10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
 Continuous-tone Still Images, Part 2: Compliance testing" and has document
 numbers ISO/IEC IS 10918-2, ITU-T T.83.
-
-Some extensions to the original JPEG standard are defined in JPEG Part 3,
-a newer ISO standard numbered ISO/IEC IS 10918-3 and ITU-T T.84.  IJG
-currently does not support any Part 3 extensions.
+IJG JPEG 8 introduces an implementation of the JPEG SmartScale extension
+which is specified in a contributed document at ITU and ISO with title "ITU-T
+JPEG-Plus Proposal for Extending ITU-T T.81 for Advanced Image Coding", April
+2006, Geneva, Switzerland.  The latest version of the document is Revision 3.
 
 The JPEG standard does not specify all details of an interchangeable file
 format.  For the omitted details we follow the "JFIF" conventions, revision
-1.02.  A copy of the JFIF spec is available from:
-	Literature Department
-	C-Cube Microsystems, Inc.
-	1778 McCarthy Blvd.
-	Milpitas, CA 95035
-	phone (408) 944-6300,  fax (408) 944-6314
-A PostScript version of this document is available by FTP at
-ftp://ftp.uu.net/graphics/jpeg/jfif.ps.gz.  There is also a plain text
-version at ftp://ftp.uu.net/graphics/jpeg/jfif.txt.gz, but it is missing
-the figures.
+1.02.  JFIF 1.02 has been adopted as an Ecma International Technical Report
+and thus received a formal publication status.  It is available as a free
+download in PDF format from
+http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
+A PostScript version of the JFIF document is available at
+http://www.ijg.org/files/jfif.ps.gz.  There is also a plain text version at
+http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.
 
 The TIFF 6.0 file format specification can be obtained by FTP from
 ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation scheme
 found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
 IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
 Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7).  Copies of this Note can be obtained from ftp.sgi.com or
-from ftp://ftp.uu.net/graphics/jpeg/.  It is expected that the next revision
+(Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
 of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
 Although IJG's own code does not support TIFF/JPEG, the free libtiff library
-uses our library to implement TIFF/JPEG per the Note.  libtiff is available
-from ftp://ftp.sgi.com/graphics/tiff/.
+uses our library to implement TIFF/JPEG per the Note.
 
 
 ARCHIVE LOCATIONS
 =================
 
-The "official" archive site for this software is ftp.uu.net (Internet
-address 192.48.96.9).  The most recent released version can always be found
-there in directory graphics/jpeg.  This particular version will be archived
-as ftp://ftp.uu.net/graphics/jpeg/jpegsrc.v6b.tar.gz.  If you don't have
-direct Internet access, UUNET's archives are also available via UUCP; contact
-help@uunet.uu.net for information on retrieving files that way.
-
-Numerous Internet sites maintain copies of the UUNET files.  However, only
-ftp.uu.net is guaranteed to have the latest official version.
-
-You can also obtain this software in DOS-compatible "zip" archive format from
-the SimTel archives (ftp://ftp.simtel.net/pub/simtelnet/msdos/graphics/), or
-on CompuServe in the Graphics Support forum (GO CIS:GRAPHSUP), library 12
-"JPEG Tools".  Again, these versions may sometimes lag behind the ftp.uu.net
-release.
-
-The JPEG FAQ (Frequently Asked Questions) article is a useful source of
-general information about JPEG.  It is updated constantly and therefore is
-not included in this distribution.  The FAQ is posted every two weeks to
-Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".  This particular version will be archived as
+http://www.ijg.org/files/jpegsrc.v8b.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr8b.zip.
+
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.
 It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
 and other news.answers archive sites, including the official news.answers
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
@@ -307,79 +268,59 @@ with body
 	send usenet/news.answers/jpeg-faq/part2
 
 
-RELATED SOFTWARE
-================
+ACKNOWLEDGMENTS
+===============
+
+Thanks to Juergen Bruder for providing me with a copy of the common DCT
+algorithm article, only to find out that I had come to the same result
+in a more direct and comprehensible way with a more generative approach.
+
+Thanks to Istvan Sebestyen and Joan L. Mitchell for inviting me to the
+ITU JPEG (Study Group 16) meeting in Geneva, Switzerland.
 
-Numerous viewing and image manipulation programs now support JPEG.  (Quite a
-few of them use this library to do so.)  The JPEG FAQ described above lists
-some of the more popular free and shareware viewers, and tells where to
-obtain them on Internet.
+Thanks to Thomas Wiegand and Gary Sullivan for inviting me to the
+Joint Video Team (MPEG & ITU) meeting in Geneva, Switzerland.
 
-If you are on a Unix machine, we highly recommend Jef Poskanzer's free
-PBMPLUS software, which provides many useful operations on PPM-format image
-files.  In particular, it can convert PPM images to and from a wide range of
-other formats, thus making cjpeg/djpeg considerably more useful.  The latest
-version is distributed by the NetPBM group, and is available from numerous
-sites, notably ftp://wuarchive.wustl.edu/graphics/graphics/packages/NetPBM/.
-Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software is;
-you are likely to have difficulty making it work on any non-Unix machine.
+Thanks to John Korejwa and Massimo Ballerini for inviting me to
+fruitful consultations in Boston, MA and Milan, Italy.
 
-A different free JPEG implementation, written by the PVRG group at Stanford,
-is available from ftp://havefun.stanford.edu/pub/jpeg/.  This program
-is designed for research and experimentation rather than production use;
-it is slower, harder to use, and less portable than the IJG code, but it
-is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
-which we do not.  (On the other hand, it doesn't do progressive JPEG.)
+Thanks to Hendrik Elstner, Roland Fassauer, Simone Zuck, Guenther
+Maier-Gerber, Walter Stoeber, and Fred Schmitz for corresponding
+business development.
+
+Thanks to Nico Zschach and Dirk Stelling of the technical support team
+at the Digital Images company in Halle for providing me with extra
+equipment for configuration tests.
+
+Thanks to Richard F. Lyon (then of Foveon Inc.) for fruitful
+communication about JPEG configuration in Sigma Photo Pro software.
+
+Thanks to Andrew Finkenstadt for hosting the ijg.org site.
+
+Last but not least, special thanks to Thomas G. Lane for the original
+design and development of this singular software package.
 
 
 FILE FORMAT WARS
 ================
 
-Some JPEG programs produce files that are not compatible with our library.
-The root of the problem is that the ISO JPEG committee failed to specify a
-concrete file format.  Some vendors "filled in the blanks" on their own,
-creating proprietary formats that no one else could read.  (For example, none
-of the early commercial JPEG implementations for the Macintosh were able to
-exchange compressed files.)
-
-The file format we have adopted is called JFIF (see REFERENCES).  This format
-has been agreed to by a number of major commercial JPEG vendors, and it has
-become the de facto standard.  JFIF is a minimal or "low end" representation.
-We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
-Technical Note #2) for "high end" applications that need to record a lot of
-additional data about an image.  TIFF/JPEG is fairly new and not yet widely
-supported, unfortunately.
-
-The upcoming JPEG Part 3 standard defines a file format called SPIFF.
-SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
-be able to read the most common variant of SPIFF.  SPIFF has some technical
-advantages over JFIF, but its major claim to fame is simply that it is an
-official standard rather than an informal one.  At this point it is unclear
-whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
-standard.  IJG intends to support SPIFF once the standard is frozen, but we
-have not decided whether it should become our default output format or not.
-(In any case, our decoder will remain capable of reading JFIF indefinitely.)
-
-Various proprietary file formats incorporating JPEG compression also exist.
-We have little or no sympathy for the existence of these formats.  Indeed,
+The ISO JPEG standards committee currently promotes different formats like
+"JPEG 2000" or "JPEG XR" which are incompatible with original DCT-based
+JPEG and which are based on faulty technologies.  IJG therefore does not
+and will not support such momentary mistakes (see REFERENCES).
+We have little or no sympathy for the promotion of these formats.  Indeed,
 one of the original reasons for developing this free software was to help
-force convergence on common, open format standards for JPEG files.  Don't
-use a proprietary file format!
+force convergence on common, interoperable format standards for JPEG files.
+Don't use an incompatible file format!
+(In any case, our decoder will remain capable of reading existing JPEG
+image files indefinitely.)
 
 
 TO DO
 =====
 
-The major thrust for v7 will probably be improvement of visual quality.
-The current method for scaling the quantization tables is known not to be
-very good at low Q values.  We also intend to investigate block boundary
-smoothing, "poor man's variable quantization", and other means of improving
-quality-vs-file-size performance without sacrificing compatibility.
-
-In future versions, we are considering supporting some of the upcoming JPEG
-Part 3 extensions --- principally, variable quantization and the SPIFF file
-format.
-
-As always, speeding things up is of great interest.
+Version 8 is the first release of a new generation JPEG standard
+to overcome the limitations of the original JPEG specification.
+More features are being prepared for coming releases...
 
-Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
+Please send bug reports, offers of help, etc. to jpeg-info@uc.ag.
diff --git a/jpeg/change.log b/jpeg/change.log
index 74102c0db..f99a867db 100644
--- a/jpeg/change.log
+++ b/jpeg/change.log
@@ -1,6 +1,104 @@
 CHANGE LOG for Independent JPEG Group's JPEG software
 
 
+Version 8b  16-May-2010
+-----------------------
+
+Repair problem in new memory source manager with corrupt JPEG data.
+Thanks to Ted Campbell and Samuel Chun for the report.
+
+Repair problem in Makefile.am test target.
+Thanks to an anonymous user for the report.
+
+Support MinGW installation with automatic configure.
+Thanks to Volker Grabsch for the suggestion.
+
+
+Version 8a  28-Feb-2010
+-----------------------
+
+Writing tables-only datastreams via jpeg_write_tables works again.
+
+Support 32-bit BMPs (RGB image with Alpha channel) for read in cjpeg.
+Thanks to Brett Blackham for the suggestion.
+
+Improve accuracy in floating point IDCT calculation.
+Thanks to Robert Hooke for the hint.
+
+
+Version 8  10-Jan-2010
+----------------------
+
+jpegtran now supports the same -scale option as djpeg for "lossless" resize.
+An implementation of the JPEG SmartScale extension is required for this
+feature.  A (draft) specification of the JPEG SmartScale extension is
+available as a contributed document at ITU and ISO.  Revision 2 or later
+of the document is required (latest document version is Revision 3).
+The SmartScale extension will enable more features besides lossless resize
+in future implementations, as described in the document (new compression
+options).
+
+Add sanity check in BMP reader module to avoid cjpeg crash for empty input
+image (thanks to Isaev Ildar of ISP RAS, Moscow, RU for reporting this error).
+
+Add data source and destination managers for read from and write to
+memory buffers.  New API functions jpeg_mem_src and jpeg_mem_dest.
+Thanks to Roberto Boni from Italy for the suggestion.
+
+
+Version 7  27-Jun-2009
+----------------------
+
+New scaled DCTs implemented.
+djpeg now supports scalings N/8 with all N from 1 to 16.
+cjpeg now supports scalings 8/N with all N from 1 to 16.
+Scaled DCTs with size larger than 8 are now also used for resolving the
+common 2x2 chroma subsampling case without additional spatial resampling.
+Separate spatial resampling for those kinds of files is now only necessary
+for N>8 scaling cases.
+Furthermore, separate scaled DCT functions are provided for direct resolving
+of the common asymmetric subsampling cases (2x1 and 1x2) without additional
+spatial resampling.
+
+cjpeg -quality option has been extended for support of separate quality
+settings for luminance and chrominance (or in general, for every provided
+quantization table slot).
+New API function jpeg_default_qtables() and q_scale_factor array in library.
+
+Added -nosmooth option to cjpeg, complementary to djpeg.
+New variable "do_fancy_downsampling" in library, complement to fancy
+upsampling.  Fancy upsampling now uses direct DCT scaling with sizes
+larger than 8.  The old method is not reversible and has been removed.
+
+Support arithmetic entropy encoding and decoding.
+Added files jaricom.c, jcarith.c, jdarith.c.
+
+Straighten the file structure:
+Removed files jidctred.c, jcphuff.c, jchuff.h, jdphuff.c, jdhuff.h.
+
+jpegtran has a new "lossless" cropping feature.
+
+Implement -perfect option in jpegtran, new API function
+jtransform_perfect_transform() in transupp. (DP 204_perfect.dpatch)
+
+Better error messages for jpegtran fopen failure.
+(DP 203_jpegtran_errmsg.dpatch)
+
+Fix byte order issue with 16bit PPM/PGM files in rdppm.c/wrppm.c:
+according to Netpbm, the de facto standard implementation of the PNM formats,
+the most significant byte is first. (DP 203_rdppm.dpatch)
+
+Add -raw option to rdjpgcom not to mangle the output.
+(DP 205_rdjpgcom_raw.dpatch)
+
+Make rdjpgcom locale aware. (DP 201_rdjpgcom_locale.dpatch)
+
+Add extern "C" to jpeglib.h.
+This avoids the need to put extern "C" { ... } around #include "jpeglib.h"
+in your C++ application.  Defining the symbol DONT_USE_EXTERN_C in the
+configuration prevents this. (DP 202_jpeglib.h_c++.dpatch)
+
+
 Version 6b  27-Mar-1998
 -----------------------
 
diff --git a/jpeg/coderules.doc b/jpeg/coderules.doc
deleted file mode 100644
index 0ab5d9bd3..000000000
--- a/jpeg/coderules.doc
+++ /dev/null
@@ -1,118 +0,0 @@
-IJG JPEG LIBRARY:  CODING RULES
-
-Copyright (C) 1991-1996, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Since numerous people will be contributing code and bug fixes, it's important
-to establish a common coding style.  The goal of using similar coding styles
-is much more important than the details of just what that style is.
-
-In general we follow the recommendations of "Recommended C Style and Coding
-Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
-Brader).  This document is available in the IJG FTP archive (see
-jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
-
-Block comments should be laid out thusly:
-
-/*
- *  Block comments in this style.
- */
-
-We indent statements in K&R style, e.g.,
-	if (test) {
-	  then-part;
-	} else {
-	  else-part;
-	}
-with two spaces per indentation level.  (This indentation convention is
-handled automatically by GNU Emacs and many other text editors.)
-
-Multi-word names should be written in lower case with underscores, e.g.,
-multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
-are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
-the first fifteen characters.  (On some older systems, global names must be
-unique within six characters.  We accommodate this without cluttering the
-source code by using macros to substitute shorter names.)
-
-We use function prototypes everywhere; we rely on automatic source code
-transformation to feed prototype-less C compilers.  Transformation is done
-by the simple and portable tool 'ansi2knr.c' (courtesy of Ghostscript).
-ansi2knr is not very bright, so it imposes a format requirement on function
-declarations: the function name MUST BEGIN IN COLUMN 1.  Thus all functions
-should be written in the following style:
-
-LOCAL(int *)
-function_name (int a, char *b)
-{
-    code...
-}
-
-Note that each function definition must begin with GLOBAL(type), LOCAL(type),
-or METHODDEF(type).  These macros expand to "static type" or just "type" as
-appropriate.  They provide a readable indication of the routine's usage and
-can readily be changed for special needs.  (For instance, special linkage
-keywords can be inserted for use in Windows DLLs.)
-
-ansi2knr does not transform method declarations (function pointers in
-structs).  We handle these with a macro JMETHOD, defined as
-	#ifdef HAVE_PROTOTYPES
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
-	#else
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
-	#endif
-which is used like this:
-	struct function_pointers {
-	  JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp));
-	  JMETHOD(void, term_entropy_encoder, (void));
-	};
-Note the set of parentheses surrounding the parameter list.
-
-A similar solution is used for forward and external function declarations
-(see the EXTERN and JPP macros).
-
-If the code is to work on non-ANSI compilers, we cannot rely on a prototype
-declaration to coerce actual parameters into the right types.  Therefore, use
-explicit casts on actual parameters whenever the actual parameter type is not
-identical to the formal parameter.  Beware of implicit conversions to "int".
-
-It seems there are some non-ANSI compilers in which the sizeof() operator
-is defined to return int, yet size_t is defined as long.  Needless to say,
-this is brain-damaged.  Always use the SIZEOF() macro in place of sizeof(),
-so that the result is guaranteed to be of type size_t.
-
-
-The JPEG library is intended to be used within larger programs.  Furthermore,
-we want it to be reentrant so that it can be used by applications that process
-multiple images concurrently.  The following rules support these requirements:
-
-1. Avoid direct use of file I/O, "malloc", error report printouts, etc;
-pass these through the common routines provided.
-
-2. Minimize global namespace pollution.  Functions should be declared static
-wherever possible.  (Note that our method-based calling conventions help this
-a lot: in many modules only the initialization function will ever need to be
-called directly, so only that function need be externally visible.)  All
-global function names should begin with "jpeg_", and should have an
-abbreviated name (unique in the first six characters) substituted by macro
-when NEED_SHORT_EXTERNAL_NAMES is set.
-
-3. Don't use global variables; anything that must be used in another module
-should be in the common data structures.
-
-4. Don't use static variables except for read-only constant tables.  Variables
-that should be private to a module can be placed into private structures (see
-the system architecture document, structure.doc).
-
-5. Source file names should begin with "j" for files that are part of the
-library proper; source files that are not part of the library, such as cjpeg.c
-and djpeg.c, do not begin with "j".  Keep source file names to eight
-characters (plus ".c" or ".h", etc) to make life easy for MS-DOSers.  Keep
-compression and decompression code in separate source files --- some
-applications may want only one half of the library.
-
-Note: these rules (particularly #4) are not followed religiously in the
-modules that are used in cjpeg/djpeg but are not part of the JPEG library
-proper.  Those modules are not really intended to be used in other
-applications.
diff --git a/jpeg/coderules.txt b/jpeg/coderules.txt
new file mode 100644
index 000000000..357929fb4
--- /dev/null
+++ b/jpeg/coderules.txt
@@ -0,0 +1,118 @@
+IJG JPEG LIBRARY:  CODING RULES
+
+Copyright (C) 1991-1996, Thomas G. Lane.
+This file is part of the Independent JPEG Group's software.
+For conditions of distribution and use, see the accompanying README file.
+
+
+Since numerous people will be contributing code and bug fixes, it's important
+to establish a common coding style.  The goal of using similar coding styles
+is much more important than the details of just what that style is.
+
+In general we follow the recommendations of "Recommended C Style and Coding
+Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
+Brader).  This document is available in the IJG FTP archive (see
+jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
+
+Block comments should be laid out thusly:
+
+/*
+ *  Block comments in this style.
+ */
+
+We indent statements in K&R style, e.g.,
+	if (test) {
+	  then-part;
+	} else {
+	  else-part;
+	}
+with two spaces per indentation level.  (This indentation convention is
+handled automatically by GNU Emacs and many other text editors.)
+
+Multi-word names should be written in lower case with underscores, e.g.,
+multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
+are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
+the first fifteen characters.  (On some older systems, global names must be
+unique within six characters.  We accommodate this without cluttering the
+source code by using macros to substitute shorter names.)
+
+We use function prototypes everywhere; we rely on automatic source code
+transformation to feed prototype-less C compilers.  Transformation is done
+by the simple and portable tool 'ansi2knr.c' (courtesy of Ghostscript).
+ansi2knr is not very bright, so it imposes a format requirement on function
+declarations: the function name MUST BEGIN IN COLUMN 1.  Thus all functions
+should be written in the following style:
+
+LOCAL(int *)
+function_name (int a, char *b)
+{
+    code...
+}
+
+Note that each function definition must begin with GLOBAL(type), LOCAL(type),
+or METHODDEF(type).  These macros expand to "static type" or just "type" as
+appropriate.  They provide a readable indication of the routine's usage and
+can readily be changed for special needs.  (For instance, special linkage
+keywords can be inserted for use in Windows DLLs.)
+
+ansi2knr does not transform method declarations (function pointers in
+structs).  We handle these with a macro JMETHOD, defined as
+	#ifdef HAVE_PROTOTYPES
+	#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+	#else
+	#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
+	#endif
+which is used like this:
+	struct function_pointers {
+	  JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp));
+	  JMETHOD(void, term_entropy_encoder, (void));
+	};
+Note the set of parentheses surrounding the parameter list.
+
+A similar solution is used for forward and external function declarations
+(see the EXTERN and JPP macros).
+
+If the code is to work on non-ANSI compilers, we cannot rely on a prototype
+declaration to coerce actual parameters into the right types.  Therefore, use
+explicit casts on actual parameters whenever the actual parameter type is not
+identical to the formal parameter.  Beware of implicit conversions to "int".
+
+It seems there are some non-ANSI compilers in which the sizeof() operator
+is defined to return int, yet size_t is defined as long.  Needless to say,
+this is brain-damaged.  Always use the SIZEOF() macro in place of sizeof(),
+so that the result is guaranteed to be of type size_t.
+
+
+The JPEG library is intended to be used within larger programs.  Furthermore,
+we want it to be reentrant so that it can be used by applications that process
+multiple images concurrently.  The following rules support these requirements:
+
+1. Avoid direct use of file I/O, "malloc", error report printouts, etc;
+pass these through the common routines provided.
+
+2. Minimize global namespace pollution.  Functions should be declared static
+wherever possible.  (Note that our method-based calling conventions help this
+a lot: in many modules only the initialization function will ever need to be
+called directly, so only that function need be externally visible.)  All
+global function names should begin with "jpeg_", and should have an
+abbreviated name (unique in the first six characters) substituted by macro
+when NEED_SHORT_EXTERNAL_NAMES is set.
+
+3. Don't use global variables; anything that must be used in another module
+should be in the common data structures.
+
+4. Don't use static variables except for read-only constant tables.  Variables
+that should be private to a module can be placed into private structures (see
+the system architecture document, structure.txt).
+
+5. Source file names should begin with "j" for files that are part of the
+library proper; source files that are not part of the library, such as cjpeg.c
+and djpeg.c, do not begin with "j".  Keep source file names to eight
+characters (plus ".c" or ".h", etc) to make life easy for MS-DOSers.  Keep
+compression and decompression code in separate source files --- some
+applications may want only one half of the library.
+
+Note: these rules (particularly #4) are not followed religiously in the
+modules that are used in cjpeg/djpeg but are not part of the JPEG library
+proper.  Those modules are not really intended to be used in other
+applications.
diff --git a/jpeg/filelist.doc b/jpeg/filelist.doc
deleted file mode 100644
index e14982ca5..000000000
--- a/jpeg/filelist.doc
+++ /dev/null
@@ -1,210 +0,0 @@
-IJG JPEG LIBRARY:  FILE LIST
-
-Copyright (C) 1994-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Here is a road map to the files in the IJG JPEG distribution.  The
-distribution includes the JPEG library proper, plus two application
-programs ("cjpeg" and "djpeg") which use the library to convert JPEG
-files to and from some other popular image formats.  A third application
-"jpegtran" uses the library to do lossless conversion between different
-variants of JPEG.  There are also two stand-alone applications,
-"rdjpgcom" and "wrjpgcom".
-
-
-THE JPEG LIBRARY
-================
-
-Include files:
-
-jpeglib.h	JPEG library's exported data and function declarations.
-jconfig.h	Configuration declarations.  Note: this file is not present
-		in the distribution; it is generated during installation.
-jmorecfg.h	Additional configuration declarations; need not be changed
-		for a standard installation.
-jerror.h	Declares JPEG library's error and trace message codes.
-jinclude.h	Central include file used by all IJG .c files to reference
-		system include files.
-jpegint.h	JPEG library's internal data structures.
-jchuff.h	Private declarations for Huffman encoder modules.
-jdhuff.h	Private declarations for Huffman decoder modules.
-jdct.h		Private declarations for forward & reverse DCT subsystems.
-jmemsys.h	Private declarations for memory management subsystem.
-jversion.h	Version information.
-
-Applications using the library should include jpeglib.h (which in turn
-includes jconfig.h and jmorecfg.h).  Optionally, jerror.h may be included
-if the application needs to reference individual JPEG error codes.  The
-other include files are intended for internal use and would not normally
-be included by an application program.  (cjpeg/djpeg/etc do use jinclude.h,
-since its function is to improve portability of the whole IJG distribution.
-Most other applications will directly include the system include files they
-want, and hence won't need jinclude.h.)
-
-
-C source code files:
-
-These files contain most of the functions intended to be called directly by
-an application program:
-
-jcapimin.c	Application program interface: core routines for compression.
-jcapistd.c	Application program interface: standard compression.
-jdapimin.c	Application program interface: core routines for decompression.
-jdapistd.c	Application program interface: standard decompression.
-jcomapi.c	Application program interface routines common to compression
-		and decompression.
-jcparam.c	Compression parameter setting helper routines.
-jctrans.c	API and library routines for transcoding compression.
-jdtrans.c	API and library routines for transcoding decompression.
-
-Compression side of the library:
-
-jcinit.c	Initialization: determines which other modules to use.
-jcmaster.c	Master control: setup and inter-pass sequencing logic.
-jcmainct.c	Main buffer controller (preprocessor => JPEG compressor).
-jcprepct.c	Preprocessor buffer controller.
-jccoefct.c	Buffer controller for DCT coefficient buffer.
-jccolor.c	Color space conversion.
-jcsample.c	Downsampling.
-jcdctmgr.c	DCT manager (DCT implementation selection & control).
-jfdctint.c	Forward DCT using slow-but-accurate integer method.
-jfdctfst.c	Forward DCT using faster, less accurate integer method.
-jfdctflt.c	Forward DCT using floating-point arithmetic.
-jchuff.c	Huffman entropy coding for sequential JPEG.
-jcphuff.c	Huffman entropy coding for progressive JPEG.
-jcmarker.c	JPEG marker writing.
-jdatadst.c	Data destination manager for stdio output.
-
-Decompression side of the library:
-
-jdmaster.c	Master control: determines which other modules to use.
-jdinput.c	Input controller: controls input processing modules.
-jdmainct.c	Main buffer controller (JPEG decompressor => postprocessor).
-jdcoefct.c	Buffer controller for DCT coefficient buffer.
-jdpostct.c	Postprocessor buffer controller.
-jdmarker.c	JPEG marker reading.
-jdhuff.c	Huffman entropy decoding for sequential JPEG.
-jdphuff.c	Huffman entropy decoding for progressive JPEG.
-jddctmgr.c	IDCT manager (IDCT implementation selection & control).
-jidctint.c	Inverse DCT using slow-but-accurate integer method.
-jidctfst.c	Inverse DCT using faster, less accurate integer method.
-jidctflt.c	Inverse DCT using floating-point arithmetic.
-jidctred.c	Inverse DCTs with reduced-size outputs.
-jdsample.c	Upsampling.
-jdcolor.c	Color space conversion.
-jdmerge.c	Merged upsampling/color conversion (faster, lower quality).
-jquant1.c	One-pass color quantization using a fixed-spacing colormap.
-jquant2.c	Two-pass color quantization using a custom-generated colormap.
-		Also handles one-pass quantization to an externally given map.
-jdatasrc.c	Data source manager for stdio input.
-
-Support files for both compression and decompression:
-
-jerror.c	Standard error handling routines (application replaceable).
-jmemmgr.c	System-independent (more or less) memory management code.
-jutils.c	Miscellaneous utility routines.
-
-jmemmgr.c relies on a system-dependent memory management module.  The IJG
-distribution includes the following implementations of the system-dependent
-module:
-
-jmemnobs.c	"No backing store": assumes adequate virtual memory exists.
-jmemansi.c	Makes temporary files with ANSI-standard routine tmpfile().
-jmemname.c	Makes temporary files with program-generated file names.
-jmemdos.c	Custom implementation for MS-DOS (16-bit environment only):
-		can use extended and expanded memory as well as temp files.
-jmemmac.c	Custom implementation for Apple Macintosh.
-
-Exactly one of the system-dependent modules should be configured into an
-installed JPEG library (see install.doc for hints about which one to use).
-On unusual systems you may find it worthwhile to make a special
-system-dependent memory manager.
-
-
-Non-C source code files:
-
-jmemdosa.asm	80x86 assembly code support for jmemdos.c; used only in
-		MS-DOS-specific configurations of the JPEG library.
-
-
-CJPEG/DJPEG/JPEGTRAN
-====================
-
-Include files:
-
-cdjpeg.h	Declarations shared by cjpeg/djpeg/jpegtran modules.
-cderror.h	Additional error and trace message codes for cjpeg et al.
-transupp.h	Declarations for jpegtran support routines in transupp.c.
-
-C source code files:
-
-cjpeg.c		Main program for cjpeg.
-djpeg.c		Main program for djpeg.
-jpegtran.c	Main program for jpegtran.
-cdjpeg.c	Utility routines used by all three programs.
-rdcolmap.c	Code to read a colormap file for djpeg's "-map" switch.
-rdswitch.c	Code to process some of cjpeg's more complex switches.
-		Also used by jpegtran.
-transupp.c	Support code for jpegtran: lossless image manipulations.
-
-Image file reader modules for cjpeg:
-
-rdbmp.c		BMP file input.
-rdgif.c		GIF file input (now just a stub).
-rdppm.c		PPM/PGM file input.
-rdrle.c		Utah RLE file input.
-rdtarga.c	Targa file input.
-
-Image file writer modules for djpeg:
-
-wrbmp.c		BMP file output.
-wrgif.c		GIF file output (a mere shadow of its former self).
-wrppm.c		PPM/PGM file output.
-wrrle.c		Utah RLE file output.
-wrtarga.c	Targa file output.
-
-
-RDJPGCOM/WRJPGCOM
-=================
-
-C source code files:
-
-rdjpgcom.c	Stand-alone rdjpgcom application.
-wrjpgcom.c	Stand-alone wrjpgcom application.
-
-These programs do not depend on the IJG library.  They do use
-jconfig.h and jinclude.h, only to improve portability.
-
-
-ADDITIONAL FILES
-================
-
-Documentation (see README for a guide to the documentation files):
-
-README		Master documentation file.
-*.doc		Other documentation files.
-*.1		Documentation in Unix man page format.
-change.log	Version-to-version change highlights.
-example.c	Sample code for calling JPEG library.
-
-Configuration/installation files and programs (see install.doc for more info):
-
-configure	Unix shell script to perform automatic configuration.
-ltconfig	Support scripts for configure (from GNU libtool).
-ltmain.sh
-config.guess
-config.sub
-install-sh	Install shell script for those Unix systems lacking one.
-ckconfig.c	Program to generate jconfig.h on non-Unix systems.
-jconfig.doc	Template for making jconfig.h by hand.
-makefile.*	Sample makefiles for particular systems.
-jconfig.*	Sample jconfig.h for particular systems.
-ansi2knr.c	De-ANSIfier for pre-ANSI C compilers (courtesy of
-		L. Peter Deutsch and Aladdin Enterprises).
-
-Test files (see install.doc for test procedure):
-
-test*.*		Source and comparison files for confidence test.
-		These are binary image files, NOT text files.
diff --git a/jpeg/filelist.txt b/jpeg/filelist.txt
new file mode 100644
index 000000000..7e053869a
--- /dev/null
+++ b/jpeg/filelist.txt
@@ -0,0 +1,215 @@
+IJG JPEG LIBRARY:  FILE LIST
+
+Copyright (C) 1994-2009, Thomas G. Lane, Guido Vollbeding.
+This file is part of the Independent JPEG Group's software.
+For conditions of distribution and use, see the accompanying README file.
+
+
+Here is a road map to the files in the IJG JPEG distribution.  The
+distribution includes the JPEG library proper, plus two application
+programs ("cjpeg" and "djpeg") which use the library to convert JPEG
+files to and from some other popular image formats.  A third application
+"jpegtran" uses the library to do lossless conversion between different
+variants of JPEG.  There are also two stand-alone applications,
+"rdjpgcom" and "wrjpgcom".
+
+
+THE JPEG LIBRARY
+================
+
+Include files:
+
+jpeglib.h	JPEG library's exported data and function declarations.
+jconfig.h	Configuration declarations.  Note: this file is not present
+		in the distribution; it is generated during installation.
+jmorecfg.h	Additional configuration declarations; need not be changed
+		for a standard installation.
+jerror.h	Declares JPEG library's error and trace message codes.
+jinclude.h	Central include file used by all IJG .c files to reference
+		system include files.
+jpegint.h	JPEG library's internal data structures.
+jdct.h		Private declarations for forward & reverse DCT subsystems.
+jmemsys.h	Private declarations for memory management subsystem.
+jversion.h	Version information.
+
+Applications using the library should include jpeglib.h (which in turn
+includes jconfig.h and jmorecfg.h).  Optionally, jerror.h may be included
+if the application needs to reference individual JPEG error codes.  The
+other include files are intended for internal use and would not normally
+be included by an application program.  (cjpeg/djpeg/etc do use jinclude.h,
+since its function is to improve portability of the whole IJG distribution.
+Most other applications will directly include the system include files they
+want, and hence won't need jinclude.h.)
+
+
+C source code files:
+
+These files contain most of the functions intended to be called directly by
+an application program:
+
+jcapimin.c	Application program interface: core routines for compression.
+jcapistd.c	Application program interface: standard compression.
+jdapimin.c	Application program interface: core routines for decompression.
+jdapistd.c	Application program interface: standard decompression.
+jcomapi.c	Application program interface routines common to compression
+		and decompression.
+jcparam.c	Compression parameter setting helper routines.
+jctrans.c	API and library routines for transcoding compression.
+jdtrans.c	API and library routines for transcoding decompression.
+
+Compression side of the library:
+
+jcinit.c	Initialization: determines which other modules to use.
+jcmaster.c	Master control: setup and inter-pass sequencing logic.
+jcmainct.c	Main buffer controller (preprocessor => JPEG compressor).
+jcprepct.c	Preprocessor buffer controller.
+jccoefct.c	Buffer controller for DCT coefficient buffer.
+jccolor.c	Color space conversion.
+jcsample.c	Downsampling.
+jcdctmgr.c	DCT manager (DCT implementation selection & control).
+jfdctint.c	Forward DCT using slow-but-accurate integer method.
+jfdctfst.c	Forward DCT using faster, less accurate integer method.
+jfdctflt.c	Forward DCT using floating-point arithmetic.
+jchuff.c	Huffman entropy coding.
+jcarith.c	Arithmetic entropy coding.
+jcmarker.c	JPEG marker writing.
+jdatadst.c	Data destination managers for memory and stdio output.
+
+Decompression side of the library:
+
+jdmaster.c	Master control: determines which other modules to use.
+jdinput.c	Input controller: controls input processing modules.
+jdmainct.c	Main buffer controller (JPEG decompressor => postprocessor).
+jdcoefct.c	Buffer controller for DCT coefficient buffer.
+jdpostct.c	Postprocessor buffer controller.
+jdmarker.c	JPEG marker reading.
+jdhuff.c	Huffman entropy decoding.
+jdarith.c	Arithmetic entropy decoding.
+jddctmgr.c	IDCT manager (IDCT implementation selection & control).
+jidctint.c	Inverse DCT using slow-but-accurate integer method.
+jidctfst.c	Inverse DCT using faster, less accurate integer method.
+jidctflt.c	Inverse DCT using floating-point arithmetic.
+jdsample.c	Upsampling.
+jdcolor.c	Color space conversion.
+jdmerge.c	Merged upsampling/color conversion (faster, lower quality).
+jquant1.c	One-pass color quantization using a fixed-spacing colormap.
+jquant2.c	Two-pass color quantization using a custom-generated colormap.
+		Also handles one-pass quantization to an externally given map.
+jdatasrc.c	Data source managers for memory and stdio input.
+
+Support files for both compression and decompression:
+
+jaricom.c	Tables for common use in arithmetic entropy encoding and
+		decoding routines.
+jerror.c	Standard error handling routines (application replaceable).
+jmemmgr.c	System-independent (more or less) memory management code.
+jutils.c	Miscellaneous utility routines.
+
+jmemmgr.c relies on a system-dependent memory management module.  The IJG
+distribution includes the following implementations of the system-dependent
+module:
+
+jmemnobs.c	"No backing store": assumes adequate virtual memory exists.
+jmemansi.c	Makes temporary files with ANSI-standard routine tmpfile().
+jmemname.c	Makes temporary files with program-generated file names.
+jmemdos.c	Custom implementation for MS-DOS (16-bit environment only):
+		can use extended and expanded memory as well as temp files.
+jmemmac.c	Custom implementation for Apple Macintosh.
+
+Exactly one of the system-dependent modules should be configured into an
+installed JPEG library (see install.txt for hints about which one to use).
+On unusual systems you may find it worthwhile to make a special
+system-dependent memory manager.
+
+
+Non-C source code files:
+
+jmemdosa.asm	80x86 assembly code support for jmemdos.c; used only in
+		MS-DOS-specific configurations of the JPEG library.
+
+
+CJPEG/DJPEG/JPEGTRAN
+====================
+
+Include files:
+
+cdjpeg.h	Declarations shared by cjpeg/djpeg/jpegtran modules.
+cderror.h	Additional error and trace message codes for cjpeg et al.
+transupp.h	Declarations for jpegtran support routines in transupp.c.
+
+C source code files:
+
+cjpeg.c		Main program for cjpeg.
+djpeg.c		Main program for djpeg.
+jpegtran.c	Main program for jpegtran.
+cdjpeg.c	Utility routines used by all three programs.
+rdcolmap.c	Code to read a colormap file for djpeg's "-map" switch.
+rdswitch.c	Code to process some of cjpeg's more complex switches.
+		Also used by jpegtran.
+transupp.c	Support code for jpegtran: lossless image manipulations.
+
+Image file reader modules for cjpeg:
+
+rdbmp.c		BMP file input.
+rdgif.c		GIF file input (now just a stub).
+rdppm.c		PPM/PGM file input.
+rdrle.c		Utah RLE file input.
+rdtarga.c	Targa file input.
+
+Image file writer modules for djpeg:
+
+wrbmp.c		BMP file output.
+wrgif.c		GIF file output (a mere shadow of its former self).
+wrppm.c		PPM/PGM file output.
+wrrle.c		Utah RLE file output.
+wrtarga.c	Targa file output.
+
+
+RDJPGCOM/WRJPGCOM
+=================
+
+C source code files:
+
+rdjpgcom.c	Stand-alone rdjpgcom application.
+wrjpgcom.c	Stand-alone wrjpgcom application.
+
+These programs do not depend on the IJG library.  They do use
+jconfig.h and jinclude.h, only to improve portability.
+
+
+ADDITIONAL FILES
+================
+
+Documentation (see README for a guide to the documentation files):
+
+README		Master documentation file.
+*.txt		Other documentation files.
+*.1		Documentation in Unix man page format.
+change.log	Version-to-version change highlights.
+example.c	Sample code for calling JPEG library.
+
+Configuration/installation files and programs (see install.txt for more info):
+
+configure	Unix shell script to perform automatic configuration.
+configure.ac	Source file for use with Autoconf to generate configure.
+ltmain.sh	Support scripts for configure (from GNU libtool).
+config.guess
+config.sub
+depcomp
+missing
+install-sh	Install shell script for those Unix systems lacking one.
+Makefile.in	Makefile input for configure.
+Makefile.am	Source file for use with Automake to generate Makefile.in.
+ckconfig.c	Program to generate jconfig.h on non-Unix systems.
+jconfig.txt	Template for making jconfig.h by hand.
+mak*.*		Sample makefiles for particular systems.
+jconfig.*	Sample jconfig.h for particular systems.
+libjpeg.map	Script to generate shared library with versioned symbols.
+aclocal.m4	M4 macro definitions for use with Autoconf.
+ansi2knr.c	De-ANSIfier for pre-ANSI C compilers (courtesy of
+		L. Peter Deutsch and Aladdin Enterprises).
+
+Test files (see install.txt for test procedure):
+
+test*.*		Source and comparison files for confidence test.
+		These are binary image files, NOT text files.
diff --git a/jpeg/install.txt b/jpeg/install.txt
new file mode 100644
index 000000000..2ee86adf4
--- /dev/null
+++ b/jpeg/install.txt
@@ -0,0 +1,1096 @@
+INSTALLATION INSTRUCTIONS for the Independent JPEG Group's JPEG software
+
+Copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+This file is part of the Independent JPEG Group's software.
+For conditions of distribution and use, see the accompanying README file.
+
+
+This file explains how to configure and install the IJG software.  We have
+tried to make this software extremely portable and flexible, so that it can be
+adapted to almost any environment.  The downside of this decision is that the
+installation process is complicated.  We have provided shortcuts to simplify
+the task on common systems.  But in any case, you will need at least a little
+familiarity with C programming and program build procedures for your system.
+
+If you are only using this software as part of a larger program, the larger
+program's installation procedure may take care of configuring the IJG code.
+For example, Ghostscript's installation script will configure the IJG code.
+You don't need to read this file if you just want to compile Ghostscript.
+
+If you are on a Unix machine, you may not need to read this file at all.
+Try doing
+	./configure
+	make
+	make test
+If that doesn't complain, do
+	make install
+(better do "make -n install" first to see if the makefile will put the files
+where you want them).  Read further if you run into snags or want to customize
+the code for your system.
+
+
+TABLE OF CONTENTS
+-----------------
+
+Before you start
+Configuring the software:
+	using the automatic "configure" script
+	using one of the supplied jconfig and makefile files
+	by hand
+Building the software
+Testing the software
+Installing the software
+Optional stuff
+Optimization
+Hints for specific systems
+
+
+BEFORE YOU START
+================
+
+Before installing the software you must unpack the distributed source code.
+Since you are reading this file, you have probably already succeeded in this
+task.  However, there is a potential for error if you needed to convert the
+files to the local standard text file format (for example, if you are on
+MS-DOS you may have converted LF end-of-line to CR/LF).  You must apply
+such conversion to all the files EXCEPT those whose names begin with "test".
+The test files contain binary data; if you change them in any way then the
+self-test will give bad results.
+
+Please check the last section of this file to see if there are hints for the
+specific machine or compiler you are using.
+
+
+CONFIGURING THE SOFTWARE
+========================
+
+To configure the IJG code for your system, you need to create two files:
+  * jconfig.h: contains values for system-dependent #define symbols.
+  * Makefile: controls the compilation process.
+(On a non-Unix machine, you may create "project files" or some other
+substitute for a Makefile.  jconfig.h is needed in any environment.)
+
+We provide three different ways to generate these files:
+  * On a Unix system, you can just run the "configure" script.
+  * We provide sample jconfig files and makefiles for popular machines;
+    if your machine matches one of the samples, just copy the right sample
+    files to jconfig.h and Makefile.
+  * If all else fails, read the instructions below and make your own files.
+
+
+Configuring the software using the automatic "configure" script
+---------------------------------------------------------------
+
+If you are on a Unix machine, you can just type
+	./configure
+and let the configure script construct appropriate configuration files.
+If you're using "csh" on an old version of System V, you might need to type
+	sh configure
+instead to prevent csh from trying to execute configure itself.
+Expect configure to run for a few minutes, particularly on slower machines;
+it works by compiling a series of test programs.
+
+Configure was created with GNU Autoconf and it follows the usual conventions
+for GNU configure scripts.  It makes a few assumptions that you may want to
+override.  You can do this by providing optional switches to configure:
+
+* Configure will build both static and shared libraries, if possible.
+If you want to build libjpeg only as a static library, say
+	./configure --disable-shared
+If you want to build libjpeg only as a shared library, say
+	./configure --disable-static
+Configure uses GNU libtool to take care of system-dependent shared library
+building methods.
+
+* Configure will use gcc (GNU C compiler) if it's available, otherwise cc.
+To force a particular compiler to be selected, use the CC option, for example
+	./configure CC='cc'
+The same method can be used to include any unusual compiler switches.
+For example, on HP-UX you probably want to say
+	./configure CC='cc -Aa'
+to get HP's compiler to run in ANSI mode.
+
+* The default CFLAGS setting is "-g" for non-gcc compilers, "-g -O2" for gcc.
+You can override this by saying, for example,
+	./configure CFLAGS='-O2'
+if you want to compile without debugging support.
+
+* Configure will set up the makefile so that "make install" will install files
+into /usr/local/bin, /usr/local/man, etc.  You can specify an installation
+prefix other than "/usr/local" by giving configure the option "--prefix=PATH".
+
+* If you don't have a lot of swap space, you may need to enable the IJG
+software's internal virtual memory mechanism.  To do this, give the option
+"--enable-maxmem=N" where N is the default maxmemory limit in megabytes.
+This is discussed in more detail under "Selecting a memory manager", below.
+You probably don't need to worry about this on reasonably-sized Unix machines,
+unless you plan to process very large images.
+
+Configure has some other features that are useful if you are cross-compiling
+or working in a network of multiple machine types; but if you need those
+features, you probably already know how to use them.
+
+
+Configuring the software using one of the supplied jconfig and makefile files
+-----------------------------------------------------------------------------
+
+If you have one of these systems, you can just use the provided configuration
+files:
+
+Makefile	jconfig file	System and/or compiler
+
+makefile.manx	jconfig.manx	Amiga, Manx Aztec C
+makefile.sas	jconfig.sas	Amiga, SAS C
+makeproj.mac	jconfig.mac	Apple Macintosh, Metrowerks CodeWarrior
+mak*jpeg.st	jconfig.st	Atari ST/STE/TT, Pure C or Turbo C
+makefile.bcc	jconfig.bcc	MS-DOS or OS/2, Borland C
+makefile.dj	jconfig.dj	MS-DOS, DJGPP (Delorie's port of GNU C)
+makefile.mc6	jconfig.mc6	MS-DOS, Microsoft C (16-bit only)
+makefile.wat	jconfig.wat	MS-DOS, OS/2, or Windows NT, Watcom C
+makefile.vc	jconfig.vc	Windows NT/95, MS Visual C++
+make*.vc6	jconfig.vc	Windows NT/95, MS Visual C++ 6
+make*.v10	jconfig.vc	Windows NT/95, MS Visual C++ 2010 (v10)
+makefile.mms	jconfig.vms	Digital VMS, with MMS software
+makefile.vms	jconfig.vms	Digital VMS, without MMS software
+
+Copy the proper jconfig file to jconfig.h and the makefile to Makefile (or
+whatever your system uses as the standard makefile name).  For more info see
+the appropriate system-specific hints section near the end of this file.
+
+
+Configuring the software by hand
+--------------------------------
+
+First, generate a jconfig.h file.  If you are moderately familiar with C,
+the comments in jconfig.txt should be enough information to do this; just
+copy jconfig.txt to jconfig.h and edit it appropriately.  Otherwise, you may
+prefer to use the ckconfig.c program.  You will need to compile and execute
+ckconfig.c by hand --- we hope you know at least enough to do that.
+ckconfig.c may not compile the first try (in fact, the whole idea is for it
+to fail if anything is going to).  If you get compile errors, fix them by
+editing ckconfig.c according to the directions given in ckconfig.c.  Once
+you get it to run, it will write a suitable jconfig.h file, and will also
+print out some advice about which makefile to use.
+
+You may also want to look at the canned jconfig files, if there is one for a
+system similar to yours.
+
+Second, select a makefile and copy it to Makefile (or whatever your system
+uses as the standard makefile name).  The most generic makefiles we provide
+are
+	makefile.ansi:	if your C compiler supports function prototypes
+	makefile.unix:	if not.
+(You have function prototypes if ckconfig.c put "#define HAVE_PROTOTYPES"
+in jconfig.h.)  You may want to start from one of the other makefiles if
+there is one for a system similar to yours.
+
+Look over the selected Makefile and adjust options as needed.  In particular
+you may want to change the CC and CFLAGS definitions.  For instance, if you
+are using GCC, set CC=gcc.  If you had to use any compiler switches to get
+ckconfig.c to work, make sure the same switches are in CFLAGS.
+
+If you are on a system that doesn't use makefiles, you'll need to set up
+project files (or whatever you do use) to compile all the source files and
+link them into executable files cjpeg, djpeg, jpegtran, rdjpgcom, and wrjpgcom.
+See the file lists in any of the makefiles to find out which files go into
+each program.  Note that the provided makefiles all make a "library" file
+libjpeg first, but you don't have to do that if you don't want to; the file
+lists identify which source files are actually needed for compression,
+decompression, or both.  As a last resort, you can make a batch script that
+just compiles everything and links it all together; makefile.vms is an example
+of this (it's for VMS systems that have no make-like utility).
+
+Here are comments about some specific configuration decisions you'll
+need to make:
+
+Command line style
+------------------
+
+These programs can use a Unix-like command line style which supports
+redirection and piping, like this:
+	cjpeg inputfile >outputfile
+	cjpeg <inputfile >outputfile
+	source program | cjpeg >outputfile
+The simpler "two file" command line style is just
+	cjpeg inputfile outputfile
+You may prefer the two-file style, particularly if you don't have pipes.
+
+You MUST use two-file style on any system that doesn't cope well with binary
+data fed through stdin/stdout; this is true for some MS-DOS compilers, for
+example.  If you're not on a Unix system, it's safest to assume you need
+two-file style.  (But if your compiler provides either the Posix-standard
+fdopen() library routine or a Microsoft-compatible setmode() routine, you
+can safely use the Unix command line style, by defining USE_FDOPEN or
+USE_SETMODE respectively.)
+
+To use the two-file style, make jconfig.h say "#define TWO_FILE_COMMANDLINE".
+
+Selecting a memory manager
+--------------------------
+
+The IJG code is capable of working on images that are too big to fit in main
+memory; data is swapped out to temporary files as necessary.  However, the
+code to do this is rather system-dependent.  We provide five different
+memory managers:
+
+* jmemansi.c	This version uses the ANSI-standard library routine tmpfile(),
+		which not all non-ANSI systems have.  On some systems
+		tmpfile() may put the temporary file in a non-optimal
+		location; if you don't like what it does, use jmemname.c.
+
+* jmemname.c	This version creates named temporary files.  For anything
+		except a Unix machine, you'll need to configure the
+		select_file_name() routine appropriately; see the comments
+		near the head of jmemname.c.  If you use this version, define
+		NEED_SIGNAL_CATCHER in jconfig.h to make sure the temp files
+		are removed if the program is aborted.
+
+* jmemnobs.c	(That stands for No Backing Store :-).)  This will compile on
+		almost any system, but it assumes you have enough main memory
+		or virtual memory to hold the biggest images you work with.
+
+* jmemdos.c	This should be used with most 16-bit MS-DOS compilers.
+		See the system-specific notes about MS-DOS for more info.
+		IMPORTANT: if you use this, define USE_MSDOS_MEMMGR in
+		jconfig.h, and include the assembly file jmemdosa.asm in the
+		programs.  The supplied makefiles and jconfig files for
+		16-bit MS-DOS compilers already do both.
+
+* jmemmac.c	Custom version for Apple Macintosh; see the system-specific
+		notes for Macintosh for more info.
+
+To use a particular memory manager, change the SYSDEPMEM variable in your
+makefile to equal the corresponding object file name (for example, jmemansi.o
+or jmemansi.obj for jmemansi.c).
+
+If you have plenty of (real or virtual) main memory, just use jmemnobs.c.
+"Plenty" means about ten bytes for every pixel in the largest images
+you plan to process, so a lot of systems don't meet this criterion.
+If yours doesn't, try jmemansi.c first.  If that doesn't compile, you'll have
+to use jmemname.c; be sure to adjust select_file_name() for local conditions.
+You may also need to change unlink() to remove() in close_backing_store().
+
+Except with jmemnobs.c or jmemmac.c, you need to adjust the DEFAULT_MAX_MEM
+setting to a reasonable value for your system (either by adding a #define for
+DEFAULT_MAX_MEM to jconfig.h, or by adding a -D switch to the Makefile).
+This value limits the amount of data space the program will attempt to
+allocate.  Code and static data space isn't counted, so the actual memory
+needs for cjpeg or djpeg are typically 100 to 150KB more than the max-memory
+setting.  Larger max-memory settings reduce the amount of I/O needed to
+process a large image, but too large a value can result in "insufficient
+memory" failures.  On most Unix machines (and other systems with virtual
+memory), just set DEFAULT_MAX_MEM to several million and forget it.  At the
+other end of the spectrum, for MS-DOS machines you probably can't go much
+above 300K to 400K.  (On MS-DOS the value refers to conventional memory only.
+Extended/expanded memory is handled separately by jmemdos.c.)
+
+
+BUILDING THE SOFTWARE
+=====================
+
+Now you should be able to compile the software.  Just say "make" (or
+whatever's necessary to start the compilation).  Have a cup of coffee.
+
+Here are some things that could go wrong:
+
+If your compiler complains about undefined structures, you should be able to
+shut it up by putting "#define INCOMPLETE_TYPES_BROKEN" in jconfig.h.
+
+If you have trouble with missing system include files or inclusion of the
+wrong ones, read jinclude.h.  This shouldn't happen if you used configure
+or ckconfig.c to set up jconfig.h.
+
+There are a fair number of routines that do not use all of their parameters;
+some compilers will issue warnings about this, which you can ignore.  There
+are also a few configuration checks that may give "unreachable code" warnings.
+Any other warning deserves investigation.
+
+If you don't have a getenv() library routine, define NO_GETENV.
+
+Also see the system-specific hints, below.
+
+
+TESTING THE SOFTWARE
+====================
+
+As a quick test of functionality we've included a small sample image in
+several forms:
+	testorig.jpg	Starting point for the djpeg tests.
+	testimg.ppm	The output of djpeg testorig.jpg
+	testimg.bmp	The output of djpeg -bmp -colors 256 testorig.jpg
+	testimg.jpg	The output of cjpeg testimg.ppm
+	testprog.jpg	Progressive-mode equivalent of testorig.jpg.
+	testimgp.jpg	The output of cjpeg -progressive -optimize testimg.ppm
+(The first- and second-generation .jpg files aren't identical since JPEG is
+lossy.)  If you can generate duplicates of the testimg* files then you
+probably have working programs.
+
+With most of the makefiles, "make test" will perform the necessary
+comparisons.
+
+If you're using a makefile that doesn't provide the test option, run djpeg
+and cjpeg by hand and compare the output files to testimg* with whatever
+binary file comparison tool you have.  The files should be bit-for-bit
+identical.
+
+If the programs complain "MAX_ALLOC_CHUNK is wrong, please fix", then you
+need to reduce MAX_ALLOC_CHUNK to a value that fits in type size_t.
+Try adding "#define MAX_ALLOC_CHUNK 65520L" to jconfig.h.  A less likely
+configuration error is "ALIGN_TYPE is wrong, please fix": defining ALIGN_TYPE
+as long should take care of that one.
+
+If the cjpeg test run fails with "Missing Huffman code table entry", it's a
+good bet that you needed to define RIGHT_SHIFT_IS_UNSIGNED.  Go back to the
+configuration step and run ckconfig.c.  (This is a good plan for any other
+test failure, too.)
+
+If you are using Unix (one-file) command line style on a non-Unix system,
+it's a good idea to check that binary I/O through stdin/stdout actually
+works.  You should get the same results from "djpeg <testorig.jpg >out.ppm"
+as from "djpeg -outfile out.ppm testorig.jpg".  Note that the makefiles all
+use the latter style and therefore do not exercise stdin/stdout!  If this
+check fails, try recompiling with USE_SETMODE or USE_FDOPEN defined.
+If it still doesn't work, better use two-file style.
+
+If you chose a memory manager other than jmemnobs.c, you should test that
+temporary-file usage works.  Try "djpeg -bmp -colors 256 -max 0 testorig.jpg"
+and make sure its output matches testimg.bmp.  If you have any really large
+images handy, try compressing them with -optimize and/or decompressing with
+-colors 256 to make sure your DEFAULT_MAX_MEM setting is not too large.
+
+NOTE: this is far from an exhaustive test of the JPEG software; some modules,
+such as 1-pass color quantization, are not exercised at all.  It's just a
+quick test to give you some confidence that you haven't missed something
+major.
+
+
+INSTALLING THE SOFTWARE
+=======================
+
+Once you're done with the above steps, you can install the software by
+copying the executable files (cjpeg, djpeg, jpegtran, rdjpgcom, and wrjpgcom)
+to wherever you normally install programs.  On Unix systems, you'll also want
+to put the man pages (cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1)
+in the man-page directory.  The pre-fab makefiles don't support this step
+since there's such a wide variety of installation procedures on different
+systems.
+
+If you generated a Makefile with the "configure" script, you can just say
+	make install
+to install the programs and their man pages into the standard places.
+(You'll probably need to be root to do this.)  We recommend first saying
+	make -n install
+to see where configure thought the files should go.  You may need to edit
+the Makefile, particularly if your system's conventions for man page
+filenames don't match what configure expects.
+
+If you want to install the IJG library itself, for use in compiling other
+programs besides ours, then you need to put the four include files
+	jpeglib.h jerror.h jconfig.h jmorecfg.h
+into your include-file directory, and put the library file libjpeg.a
+(extension may vary depending on system) wherever library files go.
+If you generated a Makefile with "configure", it will do what it thinks
+is the right thing if you say
+	make install-lib
+
+
+OPTIONAL STUFF
+==============
+
+Progress monitor:
+
+If you like, you can #define PROGRESS_REPORT (in jconfig.h) to enable display
+of percent-done progress reports.  The routine provided in cdjpeg.c merely
+prints percentages to stderr, but you can customize it to do something
+fancier.
+
+Utah RLE file format support:
+
+We distribute the software with support for RLE image files (Utah Raster
+Toolkit format) disabled, because the RLE support won't compile without the
+Utah library.  If you have URT version 3.1 or later, you can enable RLE
+support as follows:
+	1.  #define RLE_SUPPORTED in jconfig.h.
+	2.  Add a -I option to CFLAGS in the Makefile for the directory
+	    containing the URT .h files (typically the "include"
+	    subdirectory of the URT distribution).
+	3.  Add -L... -lrle to LDLIBS in the Makefile, where ... specifies
+	    the directory containing the URT "librle.a" file (typically the
+	    "lib" subdirectory of the URT distribution).
+
+Support for 12-bit-deep pixel data:
+
+The JPEG standard allows either 8-bit or 12-bit data precision.  (For color,
+this means 8 or 12 bits per channel, of course.)  If you need to work with
+deeper than 8-bit data, you can compile the IJG code for 12-bit operation.
+To do so:
+  1. In jmorecfg.h, define BITS_IN_JSAMPLE as 12 rather than 8.
+  2. In jconfig.h, undefine BMP_SUPPORTED, RLE_SUPPORTED, and TARGA_SUPPORTED,
+     because the code for those formats doesn't handle 12-bit data and won't
+     even compile.  (The PPM code does work, as explained below.  The GIF
+     code works too; it scales 8-bit GIF data to and from 12-bit depth
+     automatically.)
+  3. Compile.  Don't expect "make test" to pass, since the supplied test
+     files are for 8-bit data.
+
+Currently, 12-bit support does not work on 16-bit-int machines.
+
+Note that a 12-bit version will not read 8-bit JPEG files, nor vice versa;
+so you'll want to keep around a regular 8-bit compilation as well.
+(Run-time selection of data depth, to allow a single copy that does both,
+is possible but would probably slow things down considerably; it's very low
+on our to-do list.)
+
+The PPM reader (rdppm.c) can read 12-bit data from either text-format or
+binary-format PPM and PGM files.  Binary-format PPM/PGM files which have a
+maxval greater than 255 are assumed to use 2 bytes per sample, MSB first
+(big-endian order).  As of early 1995, 2-byte binary format is not
+officially supported by the PBMPLUS library, but it is expected that a
+future release of PBMPLUS will support it.  Note that the PPM reader will
+read files of any maxval regardless of the BITS_IN_JSAMPLE setting; incoming
+data is automatically rescaled to either maxval=255 or maxval=4095 as
+appropriate for the cjpeg bit depth.
+
+The PPM writer (wrppm.c) will normally write 2-byte binary PPM or PGM
+format, maxval 4095, when compiled with BITS_IN_JSAMPLE=12.  Since this
+format is not yet widely supported, you can disable it by compiling wrppm.c
+with PPM_NORAWWORD defined; then the data is scaled down to 8 bits to make a
+standard 1-byte/sample PPM or PGM file.  (Yes, this means still another copy
+of djpeg to keep around.  But hopefully you won't need it for very long.
+Poskanzer's supposed to get that new PBMPLUS release out Real Soon Now.)
+
+Of course, if you are working with 12-bit data, you probably have it stored
+in some other, nonstandard format.  In that case you'll probably want to
+write your own I/O modules to read and write your format.
+
+Note that a 12-bit version of cjpeg always runs in "-optimize" mode, in
+order to generate valid Huffman tables.  This is necessary because our
+default Huffman tables only cover 8-bit data.
+
+Removing code:
+
+If you need to make a smaller version of the JPEG software, some optional
+functions can be removed at compile time.  See the xxx_SUPPORTED #defines in
+jconfig.h and jmorecfg.h.  If at all possible, we recommend that you leave in
+decoder support for all valid JPEG files, to ensure that you can read anyone's
+output.  Taking out support for image file formats that you don't use is the
+most painless way to make the programs smaller.  Another possibility is to
+remove some of the DCT methods: in particular, the "IFAST" method may not be
+sufficiently faster than the others to be worth keeping on your machine.
+(If you do remove ISLOW or IFAST, be sure to redefine JDCT_DEFAULT or
+JDCT_FASTEST to a supported method, by adding a #define in jconfig.h.)
+
+
+OPTIMIZATION
+============
+
+Unless you own a Cray, you'll probably be interested in making the JPEG
+software go as fast as possible.  This section covers some machine-dependent
+optimizations you may want to try.  We suggest that before trying any of
+this, you first get the basic installation to pass the self-test step.
+Repeat the self-test after any optimization to make sure that you haven't
+broken anything.
+
+The integer DCT routines perform a lot of multiplications.  These
+multiplications must yield 32-bit results, but none of their input values
+are more than 16 bits wide.  On many machines, notably the 680x0 and 80x86
+CPUs, a 16x16=>32 bit multiply instruction is faster than a full 32x32=>32
+bit multiply.  Unfortunately there is no portable way to specify such a
+multiplication in C, but some compilers can generate one when you use the
+right combination of casts.  See the MULTIPLYxxx macro definitions in
+jdct.h.  If your compiler makes "int" be 32 bits and "short" be 16 bits,
+defining SHORTxSHORT_32 is fairly likely to work.  When experimenting with
+alternate definitions, be sure to test not only whether the code still works
+(use the self-test), but also whether it is actually faster --- on some
+compilers, alternate definitions may compute the right answer, yet be slower
+than the default.  Timing cjpeg on a large PGM (grayscale) input file is the
+best way to check this, as the DCT will be the largest fraction of the runtime
+in that mode.  (Note: some of the distributed compiler-specific jconfig files
+already contain #define switches to select appropriate MULTIPLYxxx
+definitions.)
+
+If your machine has sufficiently fast floating point hardware, you may find
+that the float DCT method is faster than the integer DCT methods, even
+after tweaking the integer multiply macros.  In that case you may want to
+make the float DCT be the default method.  (The only objection to this is
+that float DCT results may vary slightly across machines.)  To do that, add
+"#define JDCT_DEFAULT JDCT_FLOAT" to jconfig.h.  Even if you don't change
+the default, you should redefine JDCT_FASTEST, which is the method selected
+by djpeg's -fast switch.  Don't forget to update the documentation files
+(usage.txt and/or cjpeg.1, djpeg.1) to agree with what you've done.
+
+If access to "short" arrays is slow on your machine, it may be a win to
+define type JCOEF as int rather than short.  This will cost a good deal of
+memory though, particularly in some multi-pass modes, so don't do it unless
+you have memory to burn and short is REALLY slow.
+
+If your compiler can compile function calls in-line, make sure the INLINE
+macro in jmorecfg.h is defined as the keyword that marks a function
+inline-able.  Some compilers have a switch that tells the compiler to inline
+any function it thinks is profitable (e.g., -finline-functions for gcc).
+Enabling such a switch is likely to make the compiled code bigger but faster.
+
+In general, it's worth trying the maximum optimization level of your compiler,
+and experimenting with any optional optimizations such as loop unrolling.
+(Unfortunately, far too many compilers have optimizer bugs ... be prepared to
+back off if the code fails self-test.)  If you do any experimentation along
+these lines, please report the optimal settings to jpeg-info@uc.ag so we
+can mention them in future releases.  Be sure to specify your machine
+and compiler version.
+
+
+HINTS FOR SPECIFIC SYSTEMS
+==========================
+
+We welcome reports on changes needed for systems not mentioned here.  Submit
+'em to jpeg-info@uc.ag.  Also, if configure or ckconfig.c is wrong about how
+to configure the JPEG software for your system, please let us know.
+
+
+Acorn RISC OS:
+
+(Thanks to Simon Middleton for these hints on compiling with Desktop C.)
+After renaming the files according to Acorn conventions, take a copy of
+makefile.ansi, change all occurrences of 'libjpeg.a' to 'libjpeg.o' and
+change these definitions as indicated:
+
+CFLAGS= -throwback -IC: -Wn
+LDLIBS=C:o.Stubs
+SYSDEPMEM=jmemansi.o
+LN=Link
+AR=LibFile -c -o
+
+Also add a new line '.c.o:; $(cc) $< $(cflags) -c -o $@'.  Remove the
+lines '$(RM) libjpeg.o' and '$(AR2) libjpeg.o' and the 'jconfig.h'
+dependency section.
+
+Copy jconfig.txt to jconfig.h.  Edit jconfig.h to define TWO_FILE_COMMANDLINE
+and CHAR_IS_UNSIGNED.
+
+Run the makefile using !AMU not !Make.  If you want to use the 'clean' and
+'test' makefile entries then you will have to fiddle with the syntax a bit
+and rename the test files.
+
+
+Amiga:
+
+SAS C 6.50 reportedly is too buggy to compile the IJG code properly.
+A patch to update to 6.51 is available from SAS or AmiNet FTP sites.
+
+The supplied config files are set up to use jmemname.c as the memory
+manager, with temporary files being created on the device named by
+"JPEGTMP:".
+
+
+Atari ST/STE/TT:
+
+Copy the project files makcjpeg.st, makdjpeg.st, maktjpeg.st, and makljpeg.st
+to cjpeg.prj, djpeg.prj, jpegtran.prj, and libjpeg.prj respectively.  The
+project files should work as-is with Pure C.  For Turbo C, change library
+filenames "pc..." to "tc..." in each project file.  Note that libjpeg.prj
+selects jmemansi.c as the recommended memory manager.  You'll probably want to
+adjust the DEFAULT_MAX_MEM setting --- you want it to be a couple hundred K
+less than your normal free memory.  Put "#define DEFAULT_MAX_MEM nnnn" into
+jconfig.h to do this.
+
+To use the 68881/68882 coprocessor for the floating point DCT, add the
+compiler option "-8" to the project files and replace pcfltlib.lib with
+pc881lib.lib in cjpeg.prj and djpeg.prj.  Or if you don't have a
+coprocessor, you may prefer to remove the float DCT code by undefining
+DCT_FLOAT_SUPPORTED in jmorecfg.h (since without a coprocessor, the float
+code will be too slow to be useful).  In that case, you can delete
+pcfltlib.lib from the project files.
+
+Note that you must make libjpeg.lib before making cjpeg.ttp, djpeg.ttp,
+or jpegtran.ttp.  You'll have to perform the self-test by hand.
+
+We haven't bothered to include project files for rdjpgcom and wrjpgcom.
+Those source files should just be compiled by themselves; they don't
+depend on the JPEG library.  You can use the default.prj project file
+of the Pure C distribution to make the programs.
+
+There is a bug in some older versions of the Turbo C library which causes the
+space used by temporary files created with "tmpfile()" not to be freed after
+an abnormal program exit.  If you check your disk afterwards, you will find
+cluster chains that are allocated but not used by a file.  This should not
+happen in cjpeg/djpeg/jpegtran, since we enable a signal catcher to explicitly
+close temp files before exiting.  But if you use the JPEG library with your
+own code, be sure to supply a signal catcher, or else use a different
+system-dependent memory manager.
+
+
+Cray:
+
+Should you be so fortunate as to be running JPEG on a Cray YMP, there is a
+compiler bug in old versions of Cray's Standard C (prior to 3.1).  If you
+still have an old compiler, you'll need to insert a line reading
+"#pragma novector" just before the loop
+    for (i = 1; i <= (int) htbl->bits[l]; i++)
+      huffsize[p++] = (char) l;
+in fix_huff_tbl (in V5beta1, line 204 of jchuff.c and line 176 of jdhuff.c).
+[This bug may or may not still occur with the current IJG code, but it's
+probably a dead issue anyway...]
+
+
+HP-UX:
+
+If you have HP-UX 7.05 or later with the "software development" C compiler,
+you should run the compiler in ANSI mode.  If using the configure script,
+say
+	./configure CC='cc -Aa'
+(or -Ae if you prefer).  If configuring by hand, use makefile.ansi and add
+"-Aa" to the CFLAGS line in the makefile.
+
+If you have a pre-7.05 system, or if you are using the non-ANSI C compiler
+delivered with a minimum HP-UX system, then you must use makefile.unix
+(and do NOT add -Aa); or just run configure without the CC option.
+
+On HP 9000 series 800 machines, the HP C compiler is buggy in revisions prior
+to A.08.07.  If you get complaints about "not a typedef name", you'll have to
+use makefile.unix, or run configure without the CC option.
+
+
+Macintosh, generic comments:
+
+The supplied user-interface files (cjpeg.c, djpeg.c, etc) are set up to
+provide a Unix-style command line interface.  You can use this interface on
+the Mac by means of the ccommand() library routine provided by Metrowerks
+CodeWarrior or Think C.  This is only appropriate for testing the library,
+however; to make a user-friendly equivalent of cjpeg/djpeg you'd really want
+to develop a Mac-style user interface.  There isn't a complete example
+available at the moment, but there are some helpful starting points:
+1. Sam Bushell's free "To JPEG" applet provides drag-and-drop conversion to
+JPEG under System 7 and later.  This only illustrates how to use the
+compression half of the library, but it does a very nice job of that part.
+The CodeWarrior source code is available from http://www.pobox.com/~jsam.
+2. Jim Brunner prepared a Mac-style user interface for both compression and
+decompression.  Unfortunately, it hasn't been updated since IJG v4, and
+the library's API has changed considerably since then.  Still it may be of
+some help, particularly as a guide to compiling the IJG code under Think C.
+Jim's code is available from the Info-Mac archives, at sumex-aim.stanford.edu
+or mirrors thereof; see file /info-mac/dev/src/jpeg-convert-c.hqx.
+
+jmemmac.c is the recommended memory manager back end for Macintosh.  It uses
+NewPtr/DisposePtr instead of malloc/free, and has a Mac-specific
+implementation of jpeg_mem_available().  It also creates temporary files that
+follow Mac conventions.  (That part of the code relies on System-7-or-later OS
+functions.  See the comments in jmemmac.c if you need to run it on System 6.)
+NOTE that USE_MAC_MEMMGR must be defined in jconfig.h to use jmemmac.c.
+
+You can also use jmemnobs.c, if you don't care about handling images larger
+than available memory.  If you use any memory manager back end other than
+jmemmac.c, we recommend replacing "malloc" and "free" by "NewPtr" and
+"DisposePtr", because Mac C libraries often have peculiar implementations of
+malloc/free.  (For instance, free() may not return the freed space to the
+Mac Memory Manager.  This is undesirable for the IJG code because jmemmgr.c
+already clumps space requests.)
+
+
+Macintosh, Metrowerks CodeWarrior:
+
+The Unix-command-line-style interface can be used by defining USE_CCOMMAND.
+You'll also need to define TWO_FILE_COMMANDLINE to avoid stdin/stdout.
+This means that when using the cjpeg/djpeg programs, you'll have to type the
+input and output file names in the "Arguments" text-edit box, rather than
+using the file radio buttons.  (Perhaps USE_FDOPEN or USE_SETMODE would
+eliminate the problem, but I haven't heard from anyone who's tried it.)
+
+On 680x0 Macs, Metrowerks defines type "double" as a 10-byte IEEE extended
+float.  jmemmgr.c won't like this: it wants sizeof(ALIGN_TYPE) to be a power
+of 2.  Add "#define ALIGN_TYPE long" to jconfig.h to eliminate the complaint.
+
+The supplied configuration file jconfig.mac can be used for your jconfig.h;
+it includes all the recommended symbol definitions.  If you have AppleScript
+installed, you can run the supplied script makeproj.mac to create CodeWarrior
+project files for the library and the testbed applications, then build the
+library and applications.  (Thanks to Dan Sears and Don Agro for this nifty
+hack, which saves us from trying to maintain CodeWarrior project files as part
+of the IJG distribution...)
+
+
+Macintosh, Think C:
+
+The documentation in Jim Brunner's "JPEG Convert" source code (see above)
+includes detailed build instructions for Think C; it's probably somewhat
+out of date for the current release, but may be helpful.
+
+If you want to build the minimal command line version, proceed as follows.
+You'll have to prepare project files for the programs; we don't include any
+in the distribution since they are not text files.  Use the file lists in
+any of the supplied makefiles as a guide.  Also add the ANSI and Unix C
+libraries in a separate segment.  You may need to divide the JPEG files into
+more than one segment; we recommend dividing compression and decompression
+modules.  Define USE_CCOMMAND in jconfig.h so that the ccommand() routine is
+called.  You must also define TWO_FILE_COMMANDLINE because stdin/stdout
+don't handle binary data correctly.
+
+On 680x0 Macs, Think C defines type "double" as a 12-byte IEEE extended float.
+jmemmgr.c won't like this: it wants sizeof(ALIGN_TYPE) to be a power of 2.
+Add "#define ALIGN_TYPE long" to jconfig.h to eliminate the complaint.
+
+jconfig.mac should work as a jconfig.h configuration file for Think C,
+but the makeproj.mac AppleScript script is specific to CodeWarrior.  Sorry.
+
+
+MIPS R3000:
+
+MIPS's cc version 1.31 has a rather nasty optimization bug.  Don't use -O
+if you have that compiler version.  (Use "cc -V" to check the version.)
+Note that the R3000 chip is found in workstations from DEC and others.
+
+
+MS-DOS, generic comments for 16-bit compilers:
+
+The IJG code is designed to work well in 80x86 "small" or "medium" memory
+models (i.e., data pointers are 16 bits unless explicitly declared "far";
+code pointers can be either size).  You may be able to use small model to
+compile cjpeg or djpeg by itself, but you will probably have to use medium
+model for any larger application.  This won't make much difference in
+performance.  You *will* take a noticeable performance hit if you use a
+large-data memory model, and you should avoid "huge" model if at all
+possible.  Be sure that NEED_FAR_POINTERS is defined in jconfig.h if you use
+a small-data memory model; be sure it is NOT defined if you use a large-data
+model.  (The supplied makefiles and jconfig files for Borland and Microsoft C
+compile in medium model and define NEED_FAR_POINTERS.)
+
+The DOS-specific memory manager, jmemdos.c, should be used if possible.
+It needs some assembly-code routines which are in jmemdosa.asm; make sure
+your makefile assembles that file and includes it in the library.  If you
+don't have a suitable assembler, you can get pre-assembled object files for
+jmemdosa by FTP from ftp.uu.net:/graphics/jpeg/jdosaobj.zip.  (DOS-oriented
+distributions of the IJG source code often include these object files.)
+
+When using jmemdos.c, jconfig.h must define USE_MSDOS_MEMMGR and must set
+MAX_ALLOC_CHUNK to less than 64K (65520L is a typical value).  If your
+C library's far-heap malloc() can't allocate blocks that large, reduce
+MAX_ALLOC_CHUNK to whatever it can handle.
+
+If you can't use jmemdos.c for some reason --- for example, because you
+don't have an assembler to assemble jmemdosa.asm --- you'll have to fall
+back to jmemansi.c or jmemname.c.  You'll probably still need to set
+MAX_ALLOC_CHUNK in jconfig.h, because most DOS C libraries won't malloc()
+more than 64K at a time.  IMPORTANT: if you use jmemansi.c or jmemname.c,
+you will have to compile in a large-data memory model in order to get the
+right stdio library.  Too bad.
+
+wrjpgcom needs to be compiled in large model, because it malloc()s a 64KB
+work area to hold the comment text.  If your C library's malloc can't
+handle that, reduce MAX_COM_LENGTH as necessary in wrjpgcom.c.
+
+Most MS-DOS compilers treat stdin/stdout as text files, so you must use
+two-file command line style.  But if your compiler has either fdopen() or
+setmode(), you can use one-file style if you like.  To do this, define
+USE_SETMODE or USE_FDOPEN so that stdin/stdout will be set to binary mode.
+(USE_SETMODE seems to work with more DOS compilers than USE_FDOPEN.)  You
+should test that I/O through stdin/stdout produces the same results as I/O
+to explicitly named files... the "make test" procedures in the supplied
+makefiles do NOT use stdin/stdout.
+
+
+MS-DOS, generic comments for 32-bit compilers:
+
+None of the above comments about memory models apply if you are using a
+32-bit flat-memory-space environment, such as DJGPP or Watcom C.  (And you
+should use one if you have it, as performance will be much better than
+8086-compatible code!)  For flat-memory-space compilers, do NOT define
+NEED_FAR_POINTERS, and do NOT use jmemdos.c.  Use jmemnobs.c if the
+environment supplies adequate virtual memory, otherwise use jmemansi.c or
+jmemname.c.
+
+You'll still need to be careful about binary I/O through stdin/stdout.
+See the last paragraph of the previous section.
+
+
+MS-DOS, Borland C:
+
+Be sure to convert all the source files to DOS text format (CR/LF newlines).
+Although Borland C will often work OK with unmodified Unix (LF newlines)
+source files, sometimes it will give bogus compile errors.
+"Illegal character '#'" is the most common such error.  (This is true with
+Borland C 3.1, but perhaps is fixed in newer releases.)
+
+If you want one-file command line style, just undefine TWO_FILE_COMMANDLINE.
+jconfig.bcc already includes #define USE_SETMODE to make this work.
+(fdopen does not work correctly.)
+
+
+MS-DOS, Microsoft C:
+
+makefile.mc6 works with Microsoft C, DOS Visual C++, etc.  It should only
+be used if you want to build a 16-bit (small or medium memory model) program.
+
+If you want one-file command line style, just undefine TWO_FILE_COMMANDLINE.
+jconfig.mc6 already includes #define USE_SETMODE to make this work.
+(fdopen does not work correctly.)
+
+Note that this makefile assumes that the working copy of itself is called
+"makefile".  If you want to call it something else, say "makefile.mak",
+be sure to adjust the dependency line that reads "$(RFILE) : makefile".
+Otherwise the make will fail because it doesn't know how to create "makefile".
+Worse, some releases of Microsoft's make utilities give an incorrect error
+message in this situation.
+
+Old versions of MS C fail with an "out of macro expansion space" error
+because they can't cope with the macro TRACEMS8 (defined in jerror.h).
+If this happens to you, the easiest solution is to change TRACEMS8 to
+expand to nothing.  You'll lose the ability to dump out JPEG coefficient
+tables with djpeg -debug -debug, but at least you can compile.
+
+Original MS C 6.0 is very buggy; it compiles incorrect code unless you turn
+off optimization entirely (remove -O from CFLAGS).  6.00A is better, but it
+still generates bad code if you enable loop optimizations (-Ol or -Ox).
+
+MS C 8.0 crashes when compiling jquant1.c with optimization switch /Oo ...
+which is on by default.  To work around this bug, compile that one file
+with /Oo-.
+
+
+Microsoft Windows (all versions), generic comments:
+
+Some Windows system include files define typedef boolean as "unsigned char".
+The IJG code also defines typedef boolean, but we make it "int" by default.
+This doesn't affect the IJG programs because we don't import those Windows
+include files.  But if you use the JPEG library in your own program, and some
+of your program's files import one definition of boolean while some import the
+other, you can get all sorts of mysterious problems.  A good preventive step
+is to make the IJG library use "unsigned char" for boolean.  To do that,
+add something like this to your jconfig.h file:
+	/* Define "boolean" as unsigned char, not int, per Windows custom */
+	#ifndef __RPCNDR_H__	/* don't conflict if rpcndr.h already read */
+	typedef unsigned char boolean;
+	#endif
+	#define HAVE_BOOLEAN	/* prevent jmorecfg.h from redefining it */
+(This is already in jconfig.vc, by the way.)
+
+windef.h contains the declarations
+	#define far
+	#define FAR far
+Since jmorecfg.h tries to define FAR as empty, you may get a compiler
+warning if you include both jpeglib.h and windef.h (which windows.h
+includes).  To suppress the warning, you can put "#ifndef FAR"/"#endif"
+around the line "#define FAR" in jmorecfg.h.
+(Something like this is already in jmorecfg.h, by the way.)
+
+When using the library in a Windows application, you will almost certainly
+want to modify or replace the error handler module jerror.c, since our
+default error handler does a couple of inappropriate things:
+  1. it tries to write error and warning messages on stderr;
+  2. in event of a fatal error, it exits by calling exit().
+
+A simple stopgap solution for problem 1 is to replace the line
+	fprintf(stderr, "%s\n", buffer);
+(in output_message in jerror.c) with
+	MessageBox(GetActiveWindow(),buffer,"JPEG Error",MB_OK|MB_ICONERROR);
+It's highly recommended that you at least do that much, since otherwise
+error messages will disappear into nowhere.  (Beginning with IJG v6b, this
+code is already present in jerror.c; just define USE_WINDOWS_MESSAGEBOX in
+jconfig.h to enable it.)
+
+The proper solution for problem 2 is to return control to your calling
+application after a library error.  This can be done with the setjmp/longjmp
+technique discussed in libjpeg.txt and illustrated in example.c.  (NOTE:
+some older Windows C compilers provide versions of setjmp/longjmp that
+don't actually work under Windows.  You may need to use the Windows system
+functions Catch and Throw instead.)
+
+The recommended memory manager under Windows is jmemnobs.c; in other words,
+let Windows do any virtual memory management needed.  You should NOT use
+jmemdos.c or jmemdosa.asm under Windows.
+
+For Windows 3.1, we recommend compiling in medium or large memory model;
+for newer Windows versions, use a 32-bit flat memory model.  (See the MS-DOS
+sections above for more info about memory models.)  In the 16-bit memory
+models only, you'll need to put
+	#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
+into jconfig.h to limit allocation chunks to 64KB.  (Without that, you'd
+have to use huge memory model, which slows things down unnecessarily.)
+jmemnobs.c works without modification in large or flat memory models, but to
+use medium model, you need to modify its jpeg_get_large and jpeg_free_large
+routines to allocate far memory.  In any case, you might like to replace
+its calls to malloc and free with direct calls on Windows memory allocation
+functions.
+
+You may also want to modify jdatasrc.c and jdatadst.c to use Windows file
+operations rather than fread/fwrite.  This is only necessary if your C
+compiler doesn't provide a competent implementation of C stdio functions.
+
+You might want to tweak the RGB_xxx macros in jmorecfg.h so that the library
+will accept or deliver color pixels in BGR sample order, not RGB; BGR order
+is usually more convenient under Windows.  Note that this change will break
+the sample applications cjpeg/djpeg, but the library itself works fine.
+
+
+Many people want to convert the IJG library into a DLL.  This is reasonably
+straightforward, but watch out for the following:
+
+  1. Don't try to compile as a DLL in small or medium memory model; use
+large model, or even better, 32-bit flat model.  Many places in the IJG code
+assume the address of a local variable is an ordinary (not FAR) pointer;
+that isn't true in a medium-model DLL.
+
+  2. Microsoft C cannot pass file pointers between applications and DLLs.
+(See Microsoft Knowledge Base, PSS ID Number Q50336.)  So jdatasrc.c and
+jdatadst.c don't work if you open a file in your application and then pass
+the pointer to the DLL.  One workaround is to make jdatasrc.c/jdatadst.c
+part of your main application rather than part of the DLL.
+
+  3. You'll probably need to modify the macros GLOBAL() and EXTERN() to
+attach suitable linkage keywords to the exported routine names.  Similarly,
+you'll want to modify METHODDEF() and JMETHOD() to ensure function pointers
+are declared in a way that lets application routines be called back through
+the function pointers.  These macros are in jmorecfg.h.  Typical definitions
+for a 16-bit DLL are:
+	#define GLOBAL(type)		type _far _pascal _loadds _export
+	#define EXTERN(type)		extern type _far _pascal _loadds
+	#define METHODDEF(type)		static type _far _pascal
+	#define JMETHOD(type,methodname,arglist)  \
+		type (_far _pascal *methodname) arglist
+For a 32-bit DLL you may want something like
+	#define GLOBAL(type)		__declspec(dllexport) type
+	#define EXTERN(type)		extern __declspec(dllexport) type
+Although not all the GLOBAL routines are actually intended to be called by
+the application, the performance cost of making them all DLL entry points is
+negligible.
+
+The unmodified IJG library presents a very C-specific application interface,
+so the resulting DLL is only usable from C or C++ applications.  There has
+been some talk of writing wrapper code that would present a simpler interface
+usable from other languages, such as Visual Basic.  This is on our to-do list
+but hasn't been very high priority --- any volunteers out there?
+
+
+Microsoft Windows, Borland C:
+
+The provided jconfig.bcc should work OK in a 32-bit Windows environment,
+but you'll need to tweak it in a 16-bit environment (you'd need to define
+NEED_FAR_POINTERS and MAX_ALLOC_CHUNK).  Beware that makefile.bcc will need
+alteration if you want to use it for Windows --- in particular, you should
+use jmemnobs.c not jmemdos.c under Windows.
+
+Borland C++ 4.5 fails with an internal compiler error when trying to compile
+jdmerge.c in 32-bit mode.  If enough people complain, perhaps Borland will fix
+it.  In the meantime, the simplest known workaround is to add a redundant
+definition of the variable range_limit in h2v1_merged_upsample(), at the head
+of the block that handles odd image width (about line 268 in v6 jdmerge.c):
+  /* If image width is odd, do the last output column separately */
+  if (cinfo->output_width & 1) {
+    register JSAMPLE * range_limit = cinfo->sample_range_limit; /* ADD THIS */
+    cb = GETJSAMPLE(*inptr1);
+Pretty bizarre, especially since the very similar routine h2v2_merged_upsample
+doesn't trigger the bug.
+Recent reports suggest that this bug does not occur with "bcc32a" (the
+Pentium-optimized version of the compiler).
+
+Another report from a user of Borland C 4.5 was that incorrect code (leading
+to a color shift in processed images) was produced if any of the following
+optimization switch combinations were used: 
+	-Ot -Og
+	-Ot -Op
+	-Ot -Om
+So try backing off on optimization if you see such a problem.  (Are there
+several different releases all numbered "4.5"??)
+
+
+Microsoft Windows, Microsoft Visual C++:
+
+jconfig.vc should work OK with any Microsoft compiler for a 32-bit memory
+model.  makefile.vc is intended for command-line use.  (If you are using
+the Developer Studio environment, you may prefer the DevStudio project
+files; see below.)
+
+IJG JPEG 7 adds extern "C" to jpeglib.h.  This avoids the need to put
+extern "C" { ... } around #include "jpeglib.h" in your C++ application.
+You can also force VC++ to treat the library as C++ code by renaming
+all the *.c files to *.cpp (and adjusting the makefile to match).
+In this case you also need to define the symbol DONT_USE_EXTERN_C in
+the configuration to prevent jpeglib.h from using extern "C".
+
+
+Microsoft Windows, Microsoft Visual C++ 6 Developer Studio:
+
+We include makefiles that should work as project files in DevStudio 6.0 or
+later.  There is a library makefile that builds the IJG library as a static
+Win32 library, and application makefiles that build the sample applications
+as Win32 console applications.  (Even if you only want the library, we
+recommend building the applications so that you can run the self-test.)
+
+To use:
+1. Open the command prompt, change to the main directory and execute the
+   command line
+	NMAKE /f makefile.vc  setup-vc6
+   This will rename jconfig.vc to jconfig.h and the makefiles to project files.
+   (Note that the renaming is critical!)
+2. Open the workspace file jpeg.dsw, build the library project.
+   (If you are using DevStudio more recent than 6.0, you'll probably
+   get a message saying that the project files are being updated.)
+3. Open the workspace file apps.dsw, build the application projects.
+4. To perform the self-test, execute the command line
+	NMAKE /f makefile.vc  test-build
+5. Move the application .exe files from `app`\Release to an
+   appropriate location on your path.
+
+
+Microsoft Windows, Microsoft Visual C++ 2010 Developer Studio (v10):
+
+We include makefiles that should work as project files in Visual Studio
+2010 or later.  There is a library makefile that builds the IJG library
+as a static Win32 library, and application makefiles that build the sample
+applications as Win32 console applications.  (Even if you only want the
+library, we recommend building the applications so that you can run the
+self-test.)
+
+To use:
+1. Open the command prompt, change to the main directory and execute the
+   command line
+	NMAKE /f makefile.vc  setup-v10
+   This will rename jconfig.vc to jconfig.h and the makefiles to project files.
+   (Note that the renaming is critical!)
+2. Open the solution file jpeg.sln, build the library project.
+   (If you are using Visual Studio more recent than 2010 (v10), you'll
+   probably get a message saying that the project files are being updated.)
+3. Open the solution file apps.sln, build the application projects.
+4. To perform the self-test, execute the command line
+	NMAKE /f makefile.vc  test-build
+5. Move the application .exe files from `app`\Release to an
+   appropriate location on your path.
+
+Note:
+There seems to be an optimization bug in the compiler which causes the
+self-test to fail with the color quantization option.
+We have therefore disabled optimization for the file jquant2.c in the
+library project file, which allows the self-test to pass properly.
+
+
+OS/2, Borland C++:
+
+Watch out for optimization bugs in older Borland compilers; you may need
+to back off the optimization switch settings.  See the comments in
+makefile.bcc.
+
+
+SGI:
+
+On some SGI systems, you may need to set "AR2= ar -ts" in the Makefile.
+If you are using configure, you can do this by saying
+	./configure RANLIB='ar -ts'
+This change is not needed on all SGIs.  Use it only if the make fails at the
+stage of linking the completed programs.
+
+On the MIPS R4000 architecture (Indy, etc.), the compiler option "-mips2"
+reportedly speeds up the float DCT method substantially, enough to make it
+faster than the default int method (but still slower than the fast int
+method).  If you use -mips2, you may want to alter the default DCT method to
+be float.  To do this, put "#define JDCT_DEFAULT JDCT_FLOAT" in jconfig.h.
+
+
+VMS:
+
+On an Alpha/VMS system with MMS, be sure to use the "/Macro=Alpha=1"
+qualifier with MMS when building the JPEG package.
+
+VAX/VMS v5.5-1 may have problems with the test step of the build procedure
+reporting differences when it compares the original and test images.  If the
+error points to the last block of the files, it is most likely bogus and may
+be safely ignored.  It seems to be because the files are Stream_LF and
+Backup/Compare has difficulty with the (presumably) null padded files.
+This problem was not observed on VAX/VMS v6.1 or AXP/VMS v6.1.
diff --git a/jpeg/jaricom.c b/jpeg/jaricom.c
new file mode 100644
index 000000000..f43e2ea7f
--- /dev/null
+++ b/jpeg/jaricom.c
@@ -0,0 +1,153 @@
+/*
+ * jaricom.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
+ * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
+ * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* The following #define specifies the packing of the four components
+ * into the compact INT32 representation: a is shifted up to bit 16,
+ * c to bit 8, d to bit 7, and b fills the low bits; i is not used.
+ * Note that this formula must match the actual arithmetic encoder
+ * and decoder implementation.  The implementation has to be changed
+ * if this formula is changed.  The current organization is modeled
+ * on Markus Kuhn's JBIG implementation (jbig_tab.c).
+ */
+
+#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+
+const INT32 jpeg_aritab[113+1] = {
+/*
+ * Each V() entry: Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+  V(   0, 0x5a1d,   1,   1, 1 ),
+  V(   1, 0x2586,  14,   2, 0 ),
+  V(   2, 0x1114,  16,   3, 0 ),
+  V(   3, 0x080b,  18,   4, 0 ),
+  V(   4, 0x03d8,  20,   5, 0 ),
+  V(   5, 0x01da,  23,   6, 0 ),
+  V(   6, 0x00e5,  25,   7, 0 ),
+  V(   7, 0x006f,  28,   8, 0 ),
+  V(   8, 0x0036,  30,   9, 0 ),
+  V(   9, 0x001a,  33,  10, 0 ),
+  V(  10, 0x000d,  35,  11, 0 ),
+  V(  11, 0x0006,   9,  12, 0 ),
+  V(  12, 0x0003,  10,  13, 0 ),
+  V(  13, 0x0001,  12,  13, 0 ),
+  V(  14, 0x5a7f,  15,  15, 1 ),
+  V(  15, 0x3f25,  36,  16, 0 ),
+  V(  16, 0x2cf2,  38,  17, 0 ),
+  V(  17, 0x207c,  39,  18, 0 ),
+  V(  18, 0x17b9,  40,  19, 0 ),
+  V(  19, 0x1182,  42,  20, 0 ),
+  V(  20, 0x0cef,  43,  21, 0 ),
+  V(  21, 0x09a1,  45,  22, 0 ),
+  V(  22, 0x072f,  46,  23, 0 ),
+  V(  23, 0x055c,  48,  24, 0 ),
+  V(  24, 0x0406,  49,  25, 0 ),
+  V(  25, 0x0303,  51,  26, 0 ),
+  V(  26, 0x0240,  52,  27, 0 ),
+  V(  27, 0x01b1,  54,  28, 0 ),
+  V(  28, 0x0144,  56,  29, 0 ),
+  V(  29, 0x00f5,  57,  30, 0 ),
+  V(  30, 0x00b7,  59,  31, 0 ),
+  V(  31, 0x008a,  60,  32, 0 ),
+  V(  32, 0x0068,  62,  33, 0 ),
+  V(  33, 0x004e,  63,  34, 0 ),
+  V(  34, 0x003b,  32,  35, 0 ),
+  V(  35, 0x002c,  33,   9, 0 ),
+  V(  36, 0x5ae1,  37,  37, 1 ),
+  V(  37, 0x484c,  64,  38, 0 ),
+  V(  38, 0x3a0d,  65,  39, 0 ),
+  V(  39, 0x2ef1,  67,  40, 0 ),
+  V(  40, 0x261f,  68,  41, 0 ),
+  V(  41, 0x1f33,  69,  42, 0 ),
+  V(  42, 0x19a8,  70,  43, 0 ),
+  V(  43, 0x1518,  72,  44, 0 ),
+  V(  44, 0x1177,  73,  45, 0 ),
+  V(  45, 0x0e74,  74,  46, 0 ),
+  V(  46, 0x0bfb,  75,  47, 0 ),
+  V(  47, 0x09f8,  77,  48, 0 ),
+  V(  48, 0x0861,  78,  49, 0 ),
+  V(  49, 0x0706,  79,  50, 0 ),
+  V(  50, 0x05cd,  48,  51, 0 ),
+  V(  51, 0x04de,  50,  52, 0 ),
+  V(  52, 0x040f,  50,  53, 0 ),
+  V(  53, 0x0363,  51,  54, 0 ),
+  V(  54, 0x02d4,  52,  55, 0 ),
+  V(  55, 0x025c,  53,  56, 0 ),
+  V(  56, 0x01f8,  54,  57, 0 ),
+  V(  57, 0x01a4,  55,  58, 0 ),
+  V(  58, 0x0160,  56,  59, 0 ),
+  V(  59, 0x0125,  57,  60, 0 ),
+  V(  60, 0x00f6,  58,  61, 0 ),
+  V(  61, 0x00cb,  59,  62, 0 ),
+  V(  62, 0x00ab,  61,  63, 0 ),
+  V(  63, 0x008f,  61,  32, 0 ),
+  V(  64, 0x5b12,  65,  65, 1 ),
+  V(  65, 0x4d04,  80,  66, 0 ),
+  V(  66, 0x412c,  81,  67, 0 ),
+  V(  67, 0x37d8,  82,  68, 0 ),
+  V(  68, 0x2fe8,  83,  69, 0 ),
+  V(  69, 0x293c,  84,  70, 0 ),
+  V(  70, 0x2379,  86,  71, 0 ),
+  V(  71, 0x1edf,  87,  72, 0 ),
+  V(  72, 0x1aa9,  87,  73, 0 ),
+  V(  73, 0x174e,  72,  74, 0 ),
+  V(  74, 0x1424,  72,  75, 0 ),
+  V(  75, 0x119c,  74,  76, 0 ),
+  V(  76, 0x0f6b,  74,  77, 0 ),
+  V(  77, 0x0d51,  75,  78, 0 ),
+  V(  78, 0x0bb6,  77,  79, 0 ),
+  V(  79, 0x0a40,  77,  48, 0 ),
+  V(  80, 0x5832,  80,  81, 1 ),
+  V(  81, 0x4d1c,  88,  82, 0 ),
+  V(  82, 0x438e,  89,  83, 0 ),
+  V(  83, 0x3bdd,  90,  84, 0 ),
+  V(  84, 0x34ee,  91,  85, 0 ),
+  V(  85, 0x2eae,  92,  86, 0 ),
+  V(  86, 0x299a,  93,  87, 0 ),
+  V(  87, 0x2516,  86,  71, 0 ),
+  V(  88, 0x5570,  88,  89, 1 ),
+  V(  89, 0x4ca9,  95,  90, 0 ),
+  V(  90, 0x44d9,  96,  91, 0 ),
+  V(  91, 0x3e22,  97,  92, 0 ),
+  V(  92, 0x3824,  99,  93, 0 ),
+  V(  93, 0x32b4,  99,  94, 0 ),
+  V(  94, 0x2e17,  93,  86, 0 ),
+  V(  95, 0x56a8,  95,  96, 1 ),
+  V(  96, 0x4f46, 101,  97, 0 ),
+  V(  97, 0x47e5, 102,  98, 0 ),
+  V(  98, 0x41cf, 103,  99, 0 ),
+  V(  99, 0x3c3d, 104, 100, 0 ),
+  V( 100, 0x375e,  99,  93, 0 ),
+  V( 101, 0x5231, 105, 102, 0 ),
+  V( 102, 0x4c0f, 106, 103, 0 ),
+  V( 103, 0x4639, 107, 104, 0 ),
+  V( 104, 0x415e, 103,  99, 0 ),
+  V( 105, 0x5627, 105, 106, 1 ),
+  V( 106, 0x50e7, 108, 107, 0 ),
+  V( 107, 0x4b85, 109, 103, 0 ),
+  V( 108, 0x5597, 110, 109, 0 ),
+  V( 109, 0x504f, 111, 107, 0 ),
+  V( 110, 0x5a10, 110, 111, 1 ),
+  V( 111, 0x5522, 112, 109, 0 ),
+  V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * This last entry is used for fixed probability estimate of 0.5
+ * as recommended in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+  V( 113, 0x5a1d, 113, 113, 0 )
+};
diff --git a/jpeg/jcapimin.c b/jpeg/jcapimin.c
index 54fb8c58c..639ce86f4 100644
--- a/jpeg/jcapimin.c
+++ b/jpeg/jcapimin.c
@@ -2,6 +2,7 @@
  * jcapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -63,14 +64,21 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
 
   cinfo->comp_info = NULL;
 
-  for (i = 0; i < NUM_QUANT_TBLS; i++)
+  for (i = 0; i < NUM_QUANT_TBLS; i++) {
     cinfo->quant_tbl_ptrs[i] = NULL;
+    cinfo->q_scale_factor[i] = 100;
+  }
 
   for (i = 0; i < NUM_HUFF_TBLS; i++) {
     cinfo->dc_huff_tbl_ptrs[i] = NULL;
     cinfo->ac_huff_tbl_ptrs[i] = NULL;
   }
 
+  /* Must do it here for emit_dqt in case jpeg_write_tables is used */
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2-1;
+
   cinfo->script_space = NULL;
 
   cinfo->input_gamma = 1.0;	/* in case application forgets */
diff --git a/jpeg/jcarith.c b/jpeg/jcarith.c
new file mode 100644
index 000000000..0b7ea55d4
--- /dev/null
+++ b/jpeg/jcarith.c
@@ -0,0 +1,934 @@
+/*
+ * jcarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding. */
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  INT32 a;               /* A register, normalized size of coding interval */
+  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
+  INT32 zc;          /* counter for pending 0x00 output values which might *
+                          * be discarded at the end ("Pacman" termination) */
+  int ct;  /* shift countdown: a byte is taken from C when it reaches 0 */
+  int buffer;  /* most recent output byte != 0xFF, or -1 while still empty */
+
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* offset of bin S0 within dc_stats[] */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  int next_restart_num;		/* next restart number to write (0-7) */
+
+  /* Pointers to statistics areas (image lifespan; NULL until start_pass) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 (uses [0] only) */
+  unsigned char fixed_bin[4];
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64  /* bytes per DC statistics area (49 bins needed) */
+#define AC_STAT_BINS 256  /* bytes per AC statistics area (245 bins needed) */
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED /* NOTE(review): sign fill starts at bit 16-shft, presumably x fits in 16 bits */
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
+#else /* plain >> is used as is */
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif /* RIGHT_SHIFT_IS_UNSIGNED */
+
+
+LOCAL(void)
+emit_byte (int val, j_compress_ptr cinfo)
+/* Store one output byte; a refused buffer flush (suspension) is fatal. */
+{
+  struct jpeg_destination_mgr * out = cinfo->dest;
+
+  *out->next_output_byte = (JOCTET) val;
+  out->next_output_byte++;
+  if (--out->free_in_buffer == 0 && ! (*out->empty_output_buffer) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  INT32 temp;  /* candidate for the final value of the C register */
+
+  /* Section D.1.8: Termination of encoding (also run by emit_restart) */
+
+  /* Find the e->c in the coding interval with the largest
+   * number of trailing zero bits */
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+    e->c = temp + 0x8000L;
+  else
+    e->c = temp;
+  /* Send remaining bytes to output: move the next byte up to bits 19-26 */
+  e->c <<= e->ct;
+  if (e->c & 0xF8000000L) {
+    /* One final overflow has to be handled */
+    if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer + 1, cinfo);
+      if (e->buffer + 1 == 0xFF)
+	emit_byte(0x00, cinfo);  /* FF stuffing */
+    }
+    e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+    e->sc = 0;
+  } else {
+    if (e->buffer == 0)
+      ++e->zc;  /* a zero byte stays pending, it may turn out to be trailing */
+    else if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer, cinfo);
+    }
+    if (e->sc) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      do {
+	emit_byte(0xFF, cinfo);
+	emit_byte(0x00, cinfo);  /* FF stuffing */
+      } while (--e->sc);
+    }
+  }
+  /* Output final bytes only if they are not 0x00 */
+  if (e->c & 0x7FFF800L) {  /* bits 11-26: the two bytes left in C */
+    if (e->zc)  /* output final pending zero bytes */
+      do emit_byte(0x00, cinfo);
+      while (--e->zc);
+    emit_byte((e->c >> 19) & 0xFF, cinfo);
+    if (((e->c >> 19) & 0xFF) == 0xFF)
+      emit_byte(0x00, cinfo);  /* FF stuffing */
+    if (e->c & 0x7F800L) {  /* bits 11-18: second byte, skipped when zero */
+      emit_byte((e->c >> 11) & 0xFF, cinfo);
+      if (((e->c >> 11) & 0xFF) == 0xFF)
+	emit_byte(0x00, cinfo);  /* FF stuffing */
+    }
+  }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant to the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) 
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;  /* next bin state after an LPS / an MPS */
+  register INT32 qe, temp;  /* qe: packed table entry, then Qe; temp: byte */
+  register int sv;  /* *st: MPS sense in bit 7, table index in bits 0-6 */
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+  e->a -= qe;  /* A is now the interval size for the MPS */
+  if (val != (sv >> 7)) {
+    /* Encode the less probable symbol */
+    if (e->a >= qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency, otherwise code the LPS
+       * as usual: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+  } else {
+    /* Encode the more probable symbol */
+    if (e->a >= 0x8000L)
+      return;  /* A >= 0x8000 -> ready, no renormalization required */
+    if (e->a < qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+  }
+
+  /* Renormalization & data output per section D.1.6 */
+  do {
+    e->a <<= 1;
+    e->c <<= 1;
+    if (--e->ct == 0) {
+      /* Another byte is ready for output */
+      temp = e->c >> 19;  /* the byte, plus a carry in bit 8 if C overflowed */
+      if (temp > 0xFF) {
+	/* Handle overflow over all stacked 0xFF bytes */
+	if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer + 1, cinfo);
+	  if (e->buffer + 1 == 0xFF)
+	    emit_byte(0x00, cinfo);  /* FF stuffing */
+	}
+	e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+	e->sc = 0;
+	/* Note: The 3 spacer bits in the C register guarantee
+	 * that the new buffer byte can't be 0xFF here
+	 * (see page 160 in the P&M JPEG book). */
+	e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+      } else if (temp == 0xFF) {
+	++e->sc;  /* stack 0xFF byte (which might overflow later) */
+      } else {
+	/* Output all stacked 0xFF bytes, they will not overflow any more */
+	if (e->buffer == 0)
+	  ++e->zc;  /* zero bytes stay pending until a nonzero byte follows */
+	else if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer, cinfo);
+	}
+	if (e->sc) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  do {
+	    emit_byte(0xFF, cinfo);
+	    emit_byte(0x00, cinfo);  /* FF stuffing */
+	  } while (--e->sc);
+	}
+	e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+      }
+      e->c &= 0x7FFFFL;  /* keep only the 19 bits below the byte just taken */
+      e->ct += 8;  /* the next byte is complete after 8 more shifts */
+    }
+  } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(void)
+emit_restart (j_compress_ptr cinfo, int restart_num)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * comp;
+  int idx;
+
+  finish_pass(cinfo);
+
+  emit_byte(0xFF, cinfo);
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* Clear the statistics bins of every component in this scan */
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    comp = cinfo->cur_comp_info[idx];
+    /* DC statistics are used only when Ss == 0 and Ah == 0 */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      MEMZERO(e->dc_stats[comp->dc_tbl_no], DC_STAT_BINS);
+      /* DC prediction and conditioning context start over at 0 */
+      e->dc_context[idx] = 0;
+      e->last_dc_val[idx] = 0;
+    }
+    /* AC statistics are used only when Se is nonzero */
+    if (cinfo->Se != 0) {
+      MEMZERO(e->ac_stats[comp->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Put the coder registers back into their start-of-scan state */
+  e->buffer = -1;  /* empty */
+  e->ct = 11;
+  e->zc = 0;
+  e->sc = 0;
+  e->a = 0x10000L;
+  e->c = 0;
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;  /* statistics bin for the next binary decision */
+  int blkn, ci, tbl;
+  int v, v2, m;  /* v: DC difference; v2: scratch copy; m: DC, then bit mask */
+  ISHIFT_TEMPS
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;  /* account for the MCU coded by this call */
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = m - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = m;
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {  /* from here on v holds the magnitude minus 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);  /* terminates the category decisions */
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+  }
+
+  return TRUE;  /* the only return value: this module does not suspend */
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;  /* statistics bin for the next binary decision */
+  int tbl, k, ke;  /* k, ke: positions in natural_order[] (ke = EOB index) */
+  int v, v2, m;  /* v: coef magnitude; v2: scratch copy; m: bit mask */
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;  /* account for the MCU coded by this call */
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block (only MCU_data[0] and component 0 are read) */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Figure F.5: Encode_AC_Coefficients */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);  /* 3 bins per position */
+    arith_encode(cinfo, st, 0);		/* EOB decision */
+    for (;;) {  /* step over zero coefs; ends at a nonzero one (k <= ke) */
+      if ((v = (*block)[natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 0);  /* sign: positive */
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 1);  /* sign: negative */
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;  /* zero coef */
+    }
+    st += 2;
+    /* Figure F.8: Encoding the magnitude category of v */
+    m = 0;
+    if (v -= 1) {  /* from here on v holds the magnitude minus 1 */
+      arith_encode(cinfo, st, 1);
+      m = 1;
+      v2 = v;
+      if (v2 >>= 1) {
+	arith_encode(cinfo, st, 1);
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+    }
+    arith_encode(cinfo, st, 0);  /* terminates the category decisions */
+    /* Figure F.9: Encoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      arith_encode(cinfo, st, (m & v) ? 1 : 0);
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;  /* the only return value: this module does not suspend */
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *half_bin = e->fixed_bin;  /* fixed probability estimation */
+  int shift = cinfo->Al;
+  int b;
+
+  /* With restarts enabled, a marker is written first whenever the
+   * current restart interval has no MCUs left. */
+  if (cinfo->restart_interval != 0) {
+    if (e->restarts_to_go == 0) {
+      emit_restart(cinfo, e->next_restart_num);
+      e->next_restart_num = (e->next_restart_num + 1) & 7;
+      e->restarts_to_go = cinfo->restart_interval;
+    }
+    e->restarts_to_go--;
+  }
+
+  /* Every block of the MCU contributes exactly one decision:
+   * bit number Al of its DC coefficient value. */
+  b = 0;
+  while (b < cinfo->blocks_in_MCU) {
+    arith_encode(cinfo, half_bin, (MCU_data[b][0][0] >> shift) & 1);
+    b++;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;  /* statistics bin for the next binary decision */
+  int tbl, k, ke, kex;  /* k, ke, kex: positions in natural_order[] */
+  int v;  /* coefficient magnitude after the point transform */
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;  /* account for the MCU coded by this call */
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block (only MCU_data[0] and component 0 are read) */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Section G.1.3.3: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = ke; kex > 0; kex--)
+    if ((v = (*block)[natural_order[kex]]) >= 0) {
+      if (v >>= cinfo->Ah) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Ah) break;
+    }
+
+  /* Figure G.10: Encode_AC_Coefficients_SA */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);  /* 3 bins per position */
+    if (k > kex)
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+    for (;;) {  /* step over zero coefs; ends at a nonzero one (k <= ke) */
+      if ((v = (*block)[natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 0);  /* sign: positive */
+	  }
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 1);  /* sign: negative */
+	  }
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;  /* zero coef */
+    }
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;  /* the only return value: this module does not suspend */
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;  /* statistics bin for the next binary decision */
+  int blkn, ci, tbl, k, ke;  /* k, ke: positions in natural_order[] */
+  int v, v2, m;  /* v: diff or coef magnitude; v2: scratch; m: bit mask */
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;  /* account for the MCU coded by this call */
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = (*block)[0];
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {  /* from here on v holds the magnitude minus 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);  /* terminates the category decisions */
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+
+    /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Establish EOB (end-of-block) index: last nonzero coef, 0 if none */
+    for (ke = cinfo->lim_Se; ke > 0; ke--)
+      if ((*block)[natural_order[ke]]) break;
+
+    /* Figure F.5: Encode_AC_Coefficients */
+    for (k = 1; k <= ke; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);  /* 3 bins per position */
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+      while ((v = (*block)[natural_order[k]]) == 0) {
+	arith_encode(cinfo, st + 1, 0); st += 3; k++;  /* zero coef */
+      }
+      arith_encode(cinfo, st + 1, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, entropy->fixed_bin, 0);
+      } else {
+	v = -v;
+	arith_encode(cinfo, entropy->fixed_bin, 1);
+      }
+      st += 2;
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {  /* from here on v holds the magnitude minus 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	if (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (v2 >>= 1) {
+	    arith_encode(cinfo, st, 1);
+	    m <<= 1;
+	    st += 1;
+	  }
+	}
+      }
+      arith_encode(cinfo, st, 0);  /* terminates the category decisions */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+    /* Encode EOB decision only if k <= cinfo->lim_Se */
+    if (k <= cinfo->lim_Se) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      arith_encode(cinfo, st, 1);
+    }
+  }
+
+  return TRUE;  /* the only return value: this module does not suspend */
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * comp;
+  int idx, tblno;
+
+  /* This coder is fully adaptive and has no statistics gathering pass,
+   * so the master control logic must never request one. */
+  if (gather_statistics)
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+
+  /* Progressive scan parameters are taken as validated by jcmaster.c. */
+
+  /* Pick the MCU encoder for this kind of scan:
+   *   sequential                    -> encode_mcu
+   *   progressive, Ah == 0, Ss == 0 -> encode_mcu_DC_first
+   *   progressive, Ah == 0, Ss != 0 -> encode_mcu_AC_first
+   *   progressive, Ah != 0, Ss == 0 -> encode_mcu_DC_refine
+   *   progressive, Ah != 0, Ss != 0 -> encode_mcu_AC_refine
+   */
+  if (! cinfo->progressive_mode)
+    e->pub.encode_mcu = encode_mcu;
+  else if (cinfo->Ah == 0 && cinfo->Ss == 0)
+    e->pub.encode_mcu = encode_mcu_DC_first;
+  else if (cinfo->Ah == 0)
+    e->pub.encode_mcu = encode_mcu_AC_first;
+  else if (cinfo->Ss == 0)
+    e->pub.encode_mcu = encode_mcu_DC_refine;
+  else
+    e->pub.encode_mcu = encode_mcu_AC_refine;
+
+  /* Provide a zeroed statistics area for every table this scan uses */
+  for (idx = 0; idx < cinfo->comps_in_scan; idx++) {
+    comp = cinfo->cur_comp_info[idx];
+    /* DC statistics are used only when Ss == 0 and Ah == 0 */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tblno = comp->dc_tbl_no;
+      if (tblno < 0 || tblno >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tblno);
+      if (e->dc_stats[tblno] == NULL)
+	e->dc_stats[tblno] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(e->dc_stats[tblno], DC_STAT_BINS);
+      /* DC prediction and conditioning context both start at 0 */
+      e->dc_context[idx] = 0;
+      e->last_dc_val[idx] = 0;
+    }
+    /* AC statistics are used only when Se is nonzero */
+    if (cinfo->Se != 0) {
+      tblno = comp->ac_tbl_no;
+      if (tblno < 0 || tblno >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tblno);
+      if (e->ac_stats[tblno] == NULL)
+	e->ac_stats[tblno] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(e->ac_stats[tblno], AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      if (cinfo->progressive_mode)
+	/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+	cinfo->arith_ac_K[tblno] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Restart bookkeeping starts a fresh interval at restart number 0 */
+  e->next_restart_num = 0;
+  e->restarts_to_go = cinfo->restart_interval;
+
+  /* Arithmetic coder registers in their start-of-scan state */
+  e->buffer = -1;  /* empty */
+  e->ct = 11;
+  e->zc = 0;
+  e->sc = 0;
+  e->a = 0x10000L;
+  e->c = 0;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr coder;
+  int tblno;
+
+  /* The coder object is taken from the per-image memory pool */
+  coder = (arith_entropy_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(arith_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *) coder;
+  coder->pub.finish_pass = finish_pass;
+  coder->pub.start_pass = start_pass;
+
+  /* No statistics areas exist yet; start_pass allocates them on demand */
+  for (tblno = NUM_ARITH_TBLS - 1; tblno >= 0; tblno--) {
+    coder->ac_stats[tblno] = NULL;
+    coder->dc_stats[tblno] = NULL;
+  }
+
+  /* State index 113 is the jpeg_aritab entry for fixed probability 0.5 */
+  coder->fixed_bin[0] = 113;
+}
diff --git a/jpeg/jccoefct.c b/jpeg/jccoefct.c
index 1963ddb61..d775313b8 100644
--- a/jpeg/jccoefct.c
+++ b/jpeg/jccoefct.c
@@ -149,6 +149,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   int blkn, bi, ci, yindex, yoffset, blockcnt;
   JDIMENSION ypos, xpos;
   jpeg_component_info *compptr;
+  forward_DCT_ptr forward_DCT;
 
   /* Loop to write as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
@@ -167,17 +168,19 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       blkn = 0;
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
+	forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index];
 	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						: compptr->last_col_width;
 	xpos = MCU_col_num * compptr->MCU_sample_width;
-	ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+	ypos = yoffset * compptr->DCT_v_scaled_size;
+	/* ypos == (yoffset+yindex) * compptr->DCT_v_scaled_size */
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (coef->iMCU_row_num < last_iMCU_row ||
 	      yoffset+yindex < compptr->last_row_height) {
-	    (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-					 input_buf[compptr->component_index],
-					 coef->MCU_buffer[blkn],
-					 ypos, xpos, (JDIMENSION) blockcnt);
+	    (*forward_DCT) (cinfo, compptr,
+			    input_buf[compptr->component_index],
+			    coef->MCU_buffer[blkn],
+			    ypos, xpos, (JDIMENSION) blockcnt);
 	    if (blockcnt < compptr->MCU_width) {
 	      /* Create some dummy blocks at the right edge of the image. */
 	      jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
@@ -195,7 +198,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  ypos += DCTSIZE;
+	  ypos += compptr->DCT_v_scaled_size;
 	}
       }
       /* Try to write the MCU.  In event of a suspension failure, we will
@@ -252,6 +255,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   jpeg_component_info *compptr;
   JBLOCKARRAY buffer;
   JBLOCKROW thisblockrow, lastblockrow;
+  forward_DCT_ptr forward_DCT;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -274,15 +278,15 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     ndummy = (int) (blocks_across % h_samp_factor);
     if (ndummy > 0)
       ndummy = h_samp_factor - ndummy;
+    forward_DCT = cinfo->fdct->forward_DCT[ci];
     /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
      * on forward_DCT processes a complete horizontal row of DCT blocks.
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
-      (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-				   input_buf[ci], thisblockrow,
-				   (JDIMENSION) (block_row * DCTSIZE),
-				   (JDIMENSION) 0, blocks_across);
+      (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow,
+		      (JDIMENSION) (block_row * compptr->DCT_v_scaled_size),
+		      (JDIMENSION) 0, blocks_across);
       if (ndummy > 0) {
 	/* Create dummy blocks at the right edge of the image. */
 	thisblockrow += blocks_across; /* => first dummy block */
diff --git a/jpeg/jcdctmgr.c b/jpeg/jcdctmgr.c
index 61fa79b9e..0bbdbb685 100644
--- a/jpeg/jcdctmgr.c
+++ b/jpeg/jcdctmgr.c
@@ -23,7 +23,7 @@ typedef struct {
   struct jpeg_forward_dct pub;	/* public fields */
 
   /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr do_dct[MAX_COMPONENTS];
 
   /* The actual post-DCT divisors --- not identical to the quant table
    * entries, because of scaling (especially for an unnormalized DCT).
@@ -33,7 +33,7 @@ typedef struct {
 
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr do_float_dct[MAX_COMPONENTS];
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 #endif
 } my_fdct_controller;
@@ -41,6 +41,132 @@ typedef struct {
 typedef my_fdct_controller * my_fdct_ptr;
 
 
+/* The current scaled-DCT routines require ISLOW-style divisor tables,
+ * so be sure to compile that code if either ISLOW or SCALING is requested.
+ */
+#ifdef DCT_ISLOW_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#else
+#ifdef DCT_SCALING_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#endif
+#endif
+
+
+/*
+ * Perform forward DCT on one or more blocks of a component.
+ *
+ * The input samples are taken from the sample_data[] array starting at
+ * position start_row/start_col, and moving to the right for any additional
+ * blocks. The quantized coefficients are returned in coef_blocks[].
+ */
+
+METHODDEF(void)
+forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
+	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+	     JDIMENSION start_row, JDIMENSION start_col,
+	     JDIMENSION num_blocks)
+/* This version is used for integer DCT implementations. */
+{
+  /* This routine is heavily used, so it's worth coding it tightly. */
+  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index];
+  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
+  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  JDIMENSION bi;
+
+  sample_data += start_row;	/* fold in the vertical offset once */
+
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+    /* Perform the DCT */
+    (*do_dct) (workspace, sample_data, start_col);
+
+    /* Quantize/descale the coefficients, and store into coef_blocks[] */
+    { register DCTELEM temp, qval;
+      register int i;
+      register JCOEFPTR output_ptr = coef_blocks[bi];
+
+      for (i = 0; i < DCTSIZE2; i++) {
+	qval = divisors[i];
+	temp = workspace[i];
+	/* Divide the coefficient value by qval, ensuring proper rounding.
+	 * Since C does not specify the direction of rounding for negative
+	 * quotients, we have to force the dividend positive for portability.
+	 *
+	 * In most files, at least half of the output values will be zero
+	 * (at default quantization settings, more like three-quarters...)
+	 * so we should ensure that this case is fast.  On many machines,
+	 * a comparison is enough cheaper than a divide to make a special test
+	 * a win.  Since both inputs will be nonnegative, we need only test
+	 * for a < b to discover whether a/b is 0.
+	 * If your machine's division is fast enough, define FAST_DIVIDE.
+	 */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a,b)	a /= b
+#else
+#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
+#endif
+	if (temp < 0) {
+	  temp = -temp;
+	  temp += qval>>1;	/* for rounding */
+	  DIVIDE_BY(temp, qval);
+	  temp = -temp;
+	} else {
+	  temp += qval>>1;	/* for rounding */
+	  DIVIDE_BY(temp, qval);
+	}
+	output_ptr[i] = (JCOEF) temp;
+      }
+    }
+  }
+}
+
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+METHODDEF(void)
+forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
+		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+		   JDIMENSION start_row, JDIMENSION start_col,
+		   JDIMENSION num_blocks)
+/* This version is used for floating-point DCT implementations. */
+{
+  /* This routine is heavily used, so it's worth coding it tightly. */
+  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index];
+  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
+  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  JDIMENSION bi;
+
+  sample_data += start_row;	/* fold in the vertical offset once */
+
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+    /* Perform the DCT */
+    (*do_dct) (workspace, sample_data, start_col);
+
+    /* Quantize/descale the coefficients, and store into coef_blocks[] */
+    { register FAST_FLOAT temp;
+      register int i;
+      register JCOEFPTR output_ptr = coef_blocks[bi];
+
+      for (i = 0; i < DCTSIZE2; i++) {
+	/* Apply the quantization and scaling factor */
+	temp = workspace[i] * divisors[i];
+	/* Round to nearest integer.
+	 * Conversion from float to int truncates toward zero, so we bias the
+	 * value positive before the cast and subtract the bias afterwards.
+	 * The maximum coefficient size is +-16K (for 12-bit data), so this
+	 * code should work for either 16-bit or 32-bit ints.
+	 */
+	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+      }
+    }
+  }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
+
+
 /*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
@@ -56,11 +182,170 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
+  int method = 0;
   JQUANT_TBL * qtbl;
   DCTELEM * dtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    /* Select the proper DCT routine for this component's scaling */
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
+#ifdef DCT_SCALING_SUPPORTED
+    case ((1 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_1x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_2x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_3x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_4x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_5x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_6x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_7x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      fdct->do_dct[ci] = jpeg_fdct_9x9;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_10x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      fdct->do_dct[ci] = jpeg_fdct_11x11;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_12x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      fdct->do_dct[ci] = jpeg_fdct_13x13;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_14x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      fdct->do_dct[ci] = jpeg_fdct_15x15;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_16x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_16x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_14x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_12x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_10x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_8x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_6x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_4x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_2x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_8x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_7x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_6x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_5x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_4x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_3x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_2x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_1x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+#endif
+    case ((DCTSIZE << 8) + DCTSIZE):
+      switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+      case JDCT_ISLOW:
+	fdct->do_dct[ci] = jpeg_fdct_islow;
+	method = JDCT_ISLOW;
+	break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+      case JDCT_IFAST:
+	fdct->do_dct[ci] = jpeg_fdct_ifast;
+	method = JDCT_IFAST;
+	break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+      case JDCT_FLOAT:
+	fdct->do_float_dct[ci] = jpeg_fdct_float;
+	method = JDCT_FLOAT;
+	break;
+#endif
+      default:
+	ERREXIT(cinfo, JERR_NOT_COMPILED);
+	break;
+      }
+      break;
+    default:
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	       compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
+      break;
+    }
     qtblno = compptr->quant_tbl_no;
     /* Make sure specified quantization table is present */
     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
@@ -69,8 +354,8 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
     qtbl = cinfo->quant_tbl_ptrs[qtblno];
     /* Compute divisors for this quant table */
     /* We may do this more than once for same table, but it's not a big deal */
-    switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
+    switch (method) {
+#ifdef PROVIDE_ISLOW_TABLES
     case JDCT_ISLOW:
       /* For LL&M IDCT method, divisors are equal to raw quantization
        * coefficients multiplied by 8 (to counteract scaling).
@@ -84,6 +369,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
       for (i = 0; i < DCTSIZE2; i++) {
 	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
       break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
@@ -122,6 +408,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 		    CONST_BITS-3);
 	}
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
       break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
@@ -158,6 +445,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	  }
 	}
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT_float;
       break;
 #endif
     default:
@@ -168,175 +456,6 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 }
 
 
-/*
- * Perform forward DCT on one or more blocks of a component.
- *
- * The input samples are taken from the sample_data[] array starting at
- * position start_row/start_col, and moving to the right for any additional
- * blocks. The quantized coefficients are returned in coef_blocks[].
- */
-
-METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
-	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-	     JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks)
-/* This version is used for integer DCT implementations. */
-{
-  /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
-  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
-  JDIMENSION bi;
-
-  sample_data += start_row;	/* fold in the vertical offset once */
-
-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
-
-    /* Perform the DCT */
-    (*do_dct) (workspace);
-
-    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
-  }
-}
-
-
-#ifdef DCT_FLOAT_SUPPORTED
-
-METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-		   JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks)
-/* This version is used for floating-point DCT implementations. */
-{
-  /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
-  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
-  JDIMENSION bi;
-
-  sample_data += start_row;	/* fold in the vertical offset once */
-
-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
-
-    /* Perform the DCT */
-    (*do_dct) (workspace);
-
-    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
-  }
-}
-
-#endif /* DCT_FLOAT_SUPPORTED */
-
-
 /*
  * Initialize FDCT manager.
  */
@@ -353,30 +472,6 @@ jinit_forward_dct (j_compress_ptr cinfo)
   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
-  switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
-  case JDCT_ISLOW:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
-    break;
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  case JDCT_IFAST:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
-    break;
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  case JDCT_FLOAT:
-    fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
-    break;
-#endif
-  default:
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
-    break;
-  }
-
   /* Mark divisor tables unallocated */
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     fdct->divisors[i] = NULL;
diff --git a/jpeg/jchuff.c b/jpeg/jchuff.c
index f23525054..257d7aa1f 100644
--- a/jpeg/jchuff.c
+++ b/jpeg/jchuff.c
@@ -2,22 +2,48 @@
  * jchuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains Huffman entropy encoding routines.
+ * Both sequential and progressive modes are supported in this single module.
  *
  * Much of the complexity here has to do with supporting output suspension.
  * If the data destination module demands suspension, we want to be able to
  * back up to the start of the current MCU.  To do this, we copy state
  * variables into local working storage, and update them back to the
  * permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * We do not support output suspension for the progressive JPEG mode, since
+ * the library currently does not allow multiple-scan files to be written
+ * with output suspension.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jcphuff.c */
+
+
+/* The legal range of a DCT coefficient is
+ *  -1024 .. +1023  for 8-bit data;
+ * -16384 .. +16383 for 12-bit data.
+ * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MAX_COEF_BITS 10
+#else
+#define MAX_COEF_BITS 14
+#endif
+
+/* Derived data constructed for each Huffman table */
+
+typedef struct {
+  unsigned int ehufco[256];	/* code for each symbol */
+  char ehufsi[256];		/* length of code for each symbol */
+  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
+} c_derived_tbl;
 
 
 /* Expanded entropy encoder object for Huffman encoding.
@@ -65,15 +91,32 @@ typedef struct {
   c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
   c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
 
-#ifdef ENTROPY_OPT_SUPPORTED	/* Statistics tables for optimization */
+  /* Statistics tables for optimization */
   long * dc_count_ptrs[NUM_HUFF_TBLS];
   long * ac_count_ptrs[NUM_HUFF_TBLS];
-#endif
+
+  /* Following fields used only in progressive mode */
+
+  /* Mode flag: TRUE for optimization, FALSE for actual data output */
+  boolean gather_statistics;
+
+  /* next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
+   */
+  JOCTET * next_output_byte;	/* => next byte to write in buffer */
+  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
+  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
+
+  /* Coding status for AC components */
+  int ac_tbl_no;		/* the table number of the single component */
+  unsigned int EOBRUN;		/* run length of EOBs */
+  unsigned int BE;		/* # of buffered correction bits before MCU */
+  char * bit_buffer;		/* buffer for correction bits (1 per char) */
+  /* packing correction bits tightly would save some space but cost time... */
 } huff_entropy_encoder;
 
 typedef huff_entropy_encoder * huff_entropy_ptr;
 
-/* Working state while writing an MCU.
+/* Working state while writing an MCU (sequential mode).
  * This struct contains all the fields that are needed by subroutines.
  */
 
@@ -84,98 +127,37 @@ typedef struct {
   j_compress_ptr cinfo;		/* dump_buffer needs access to this */
 } working_state;
 
+/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
+ * buffer can hold.  Larger sizes may slightly improve compression, but
+ * 1000 is already well into the realm of overkill.
+ * The minimum safe size is 64 bits.
+ */
 
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo,
-					JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo));
-#ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo,
-					  JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo));
-#endif
-
+#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
 
-/*
- * Initialize for a Huffman-compressed scan.
- * If gather_statistics is TRUE, we do not output anything during the scan,
- * just count the Huffman symbols used and generate Huffman code tables.
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
  */
 
-METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
-{
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
-
-  if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
-    entropy->pub.encode_mcu = encode_mcu_gather;
-    entropy->pub.finish_pass = finish_pass_gather;
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
 #else
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
 #endif
-  } else {
-    entropy->pub.encode_mcu = encode_mcu_huff;
-    entropy->pub.finish_pass = finish_pass_huff;
-  }
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
-      /* Check for invalid table indexes */
-      /* (make_c_derived_tbl does this in the other path) */
-      if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
-      if (actbl < 0 || actbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
-      /* Allocate and zero the statistics tables */
-      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
-      if (entropy->dc_count_ptrs[dctbl] == NULL)
-	entropy->dc_count_ptrs[dctbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
-      if (entropy->ac_count_ptrs[actbl] == NULL)
-	entropy->ac_count_ptrs[actbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
-#endif
-    } else {
-      /* Compute derived values for Huffman tables */
-      /* We may do this more than once for a table, but it's not expensive */
-      jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
-			      & entropy->dc_derived_tbls[dctbl]);
-      jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
-			      & entropy->ac_derived_tbls[actbl]);
-    }
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
-
-  /* Initialize restart stuff */
-  entropy->restarts_to_go = cinfo->restart_interval;
-  entropy->next_restart_num = 0;
-}
 
 
 /*
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
- *
- * Note this is also used by jcphuff.c.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 			 c_derived_tbl ** pdtbl)
 {
@@ -264,18 +246,27 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 }
 
 
-/* Outputting bytes to the file */
+/* Outputting bytes to the file.
+ * NB: these must be called only when actually outputting,
+ * that is, entropy->gather_statistics == FALSE.
+ */
 
 /* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action)  \
+#define emit_byte_s(state,val,action)  \
 	{ *(state)->next_output_byte++ = (JOCTET) (val);  \
 	  if (--(state)->free_in_buffer == 0)  \
-	    if (! dump_buffer(state))  \
+	    if (! dump_buffer_s(state))  \
 	      { action; } }
 
+/* Emit a byte */
+#define emit_byte_e(entropy,val)  \
+	{ *(entropy)->next_output_byte++ = (JOCTET) (val);  \
+	  if (--(entropy)->free_in_buffer == 0)  \
+	    dump_buffer_e(entropy); }
+
 
 LOCAL(boolean)
-dump_buffer (working_state * state)
+dump_buffer_s (working_state * state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
   struct jpeg_destination_mgr * dest = state->cinfo->dest;
@@ -289,6 +280,20 @@ dump_buffer (working_state * state)
 }
 
 
+LOCAL(void)
+dump_buffer_e (huff_entropy_ptr entropy)
+/* Empty the output buffer; we do not support suspension in this case. */
+{
+  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
+
+  if (! (*dest->empty_output_buffer) (entropy->cinfo))
+    ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
+  /* After a successful buffer dump, must reset buffer pointers */
+  entropy->next_output_byte = dest->next_output_byte;
+  entropy->free_in_buffer = dest->free_in_buffer;
+}
+
+
 /* Outputting bits to the file */
 
 /* Only the right 24 bits of put_buffer are used; the valid bits are
@@ -299,7 +304,7 @@ dump_buffer (working_state * state)
 
 INLINE
 LOCAL(boolean)
-emit_bits (working_state * state, unsigned int code, int size)
+emit_bits_s (working_state * state, unsigned int code, int size)
 /* Emit some bits; return TRUE if successful, FALSE if must suspend */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
@@ -321,9 +326,9 @@ emit_bits (working_state * state, unsigned int code, int size)
   while (put_bits >= 8) {
     int c = (int) ((put_buffer >> 16) & 0xFF);
     
-    emit_byte(state, c, return FALSE);
+    emit_byte_s(state, c, return FALSE);
     if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(state, 0, return FALSE);
+      emit_byte_s(state, 0, return FALSE);
     }
     put_buffer <<= 8;
     put_bits -= 8;
@@ -336,17 +341,575 @@ emit_bits (working_state * state, unsigned int code, int size)
 }
 
 
+INLINE
+LOCAL(void)
+emit_bits_e (huff_entropy_ptr entropy, unsigned int code, int size)
+/* Emit the low 'size' bits of 'code'; in gather mode nothing is output. */
+{
+  /* This routine is heavily used, so it's worth coding tightly. */
+  register INT32 put_buffer = (INT32) code;
+  register int put_bits = entropy->saved.put_bits; /* bits already pending */
+
+  /* size 0 means an invalid Huffman table entry; checked even in gather mode */
+  if (size == 0)
+    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+  if (entropy->gather_statistics)
+    return;			/* do nothing if we're only getting stats */
+
+  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
+  
+  put_bits += size;		/* new number of bits in buffer */
+
+  put_buffer <<= 24 - put_bits; /* align incoming bits */
+
+  /* and merge with old buffer contents */
+  put_buffer |= entropy->saved.put_buffer;
+
+  while (put_bits >= 8) {	/* write out every completed byte */
+    int c = (int) ((put_buffer >> 16) & 0xFF); /* bits 16..23: oldest byte */
+
+    emit_byte_e(entropy, c);
+    if (c == 0xFF) {		/* need to stuff a zero byte? */
+      emit_byte_e(entropy, 0);
+    }
+    put_buffer <<= 8;
+    put_bits -= 8;
+  }
+
+  entropy->saved.put_buffer = put_buffer; /* update variables */
+  entropy->saved.put_bits = put_bits;	/* fewer than 8 bits stay pending */
+}
+
+
 LOCAL(boolean)
-flush_bits (working_state * state)
+flush_bits_s (working_state * state)
 {
-  if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
+  if (! emit_bits_s(state, 0x7F, 7)) /* fill any partial byte with ones */
     return FALSE;
-  state->cur.put_buffer = 0;	/* and reset bit-buffer to empty */
+  state->cur.put_buffer = 0;	     /* and reset bit-buffer to empty */
   state->cur.put_bits = 0;
   return TRUE;
 }
 
 
+LOCAL(void)
+flush_bits_e (huff_entropy_ptr entropy)
+{
+  emit_bits_e(entropy, 0x7F, 7); /* fill any partial byte with ones */
+  entropy->saved.put_buffer = 0; /* and reset bit-buffer to empty */
+  entropy->saved.put_bits = 0;	/* pad bits left over after that are dropped */
+}
+
+
+/*
+ * Emit (or, when gathering statistics, just count) a Huffman symbol.
+ */
+
+INLINE
+LOCAL(void)
+emit_dc_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+  if (entropy->gather_statistics)
+    entropy->dc_count_ptrs[tbl_no][symbol]++;	/* statistics pass: tally only */
+  else {			/* output pass: send the symbol's code */
+    c_derived_tbl * tbl = entropy->dc_derived_tbls[tbl_no];
+    emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
+  }
+}
+
+
+INLINE
+LOCAL(void)
+emit_ac_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+  if (entropy->gather_statistics)
+    entropy->ac_count_ptrs[tbl_no][symbol]++;	/* statistics pass: tally only */
+  else {			/* output pass: send the symbol's code */
+    c_derived_tbl * tbl = entropy->ac_derived_tbls[tbl_no];
+    emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
+  }
+}
+
+
+/*
+ * Emit nbits bits from a correction bit buffer (one bit stored per char).
+ */
+
+LOCAL(void)
+emit_buffered_bits (huff_entropy_ptr entropy, char * bufstart,
+		    unsigned int nbits)
+{
+  if (entropy->gather_statistics)
+    return;			/* no real work */
+
+  while (nbits > 0) {
+    emit_bits_e(entropy, (unsigned int) (*bufstart), 1); /* low bit only */
+    bufstart++;
+    nbits--;
+  }
+}
+
+
+/*
+ * Emit any pending EOBRUN symbol plus the buffered correction bits (BE).
+ */
+
+LOCAL(void)
+emit_eobrun (huff_entropy_ptr entropy)
+{
+  register int temp, nbits;
+
+  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
+    temp = entropy->EOBRUN;
+    nbits = 0;
+    while ((temp >>= 1))
+      nbits++;			/* nbits = index of EOBRUN's highest set bit */
+    /* safety check: shouldn't happen given limited correction-bit buffer */
+    if (nbits > 14)
+      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, nbits << 4); /* nbits in high nibble */
+    if (nbits)			/* emit_bits_e rejects calls with size 0 */
+      emit_bits_e(entropy, entropy->EOBRUN, nbits); /* low nbits bits of the run */
+
+    entropy->EOBRUN = 0;
+
+    /* Emit any buffered correction bits */
+    emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
+    entropy->BE = 0;
+  }
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(boolean)
+emit_restart_s (working_state * state, int restart_num)
+{
+  int ci;
+
+  if (! flush_bits_s(state))	/* pad out the current partial byte first */
+    return FALSE;
+
+  emit_byte_s(state, 0xFF, return FALSE); /* marker prefix; FALSE on failed dump */
+  emit_byte_s(state, JPEG_RST0 + restart_num, return FALSE);
+
+  /* Re-initialize DC predictions to 0 */
+  for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
+    state->cur.last_dc_val[ci] = 0;
+
+  /* The restart counter is not updated until we successfully write the MCU. */
+
+  return TRUE;
+}
+
+
+LOCAL(void)
+emit_restart_e (huff_entropy_ptr entropy, int restart_num)
+{
+  int ci;
+
+  emit_eobrun(entropy);		/* flush pending EOB run and correction bits */
+
+  if (! entropy->gather_statistics) { /* marker bytes only in an output pass */
+    flush_bits_e(entropy);
+    emit_byte_e(entropy, 0xFF);
+    emit_byte_e(entropy, JPEG_RST0 + restart_num);
+  }
+
+  if (entropy->cinfo->Ss == 0) {
+    /* Re-initialize DC predictions to 0 */
+    for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
+      entropy->saved.last_dc_val[ci] = 0;
+  } else {
+    /* Re-initialize all AC-related fields to 0 */
+    entropy->EOBRUN = 0;
+    entropy->BE = 0;
+  }
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp, temp2;
+  register int nbits;
+  int blkn, ci;
+  int Al = cinfo->Al;
+  JBLOCKROW block;
+  jpeg_component_info * compptr;
+  ISHIFT_TEMPS
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer; /* load output state */
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+
+    /* DC differences are figured on the point-transformed values. */
+    temp = temp2 - entropy->saved.last_dc_val[ci];
+    entropy->saved.last_dc_val[ci] = temp2; /* becomes the next prediction */
+
+    /* Encode the DC coefficient difference per section G.1.2.1 */
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      /* For a negative input, want temp2 = bitwise complement of abs(input) */
+      /* This code assumes we are on a two's complement machine */
+      temp2--;
+    }
+    
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    while (temp) {
+      nbits++;
+      temp >>= 1;
+    }
+    /* Check for out-of-range coefficient values.
+     * Since we're encoding a difference, the range limit is twice as much.
+     */
+    if (nbits > MAX_COEF_BITS+1)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+    
+    /* Count/emit the Huffman-coded symbol for the number of bits */
+    emit_dc_symbol(entropy, compptr->dc_tbl_no, nbits);
+    
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
+    if (nbits)			/* emit_bits rejects calls with size 0 */
+      emit_bits_e(entropy, (unsigned int) temp2, nbits);
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer; /* store output state */
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;	/* restart numbers cycle 0..7 */
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;			/* the only return: this routine never reports suspension */
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp, temp2;
+  register int nbits;
+  register int r, k;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer; /* load output state */
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  Se = cinfo->Se;
+  Al = cinfo->Al;
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block (only MCU_data[0] is used here) */
+  block = MCU_data[0];
+
+  /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+  
+  r = 0;			/* r = run length of zeros */
+   
+  for (k = cinfo->Ss; k <= Se; k++) {
+    if ((temp = (*block)[natural_order[k]]) == 0) {
+      r++;
+      continue;
+    }
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value; so the code is
+     * interwoven with finding the abs value (temp) and output bits (temp2).
+     */
+    if (temp < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      temp >>= Al;		/* apply the point transform */
+      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;		/* apply the point transform */
+      temp2 = temp;
+    }
+    /* Watch out for case that nonzero coef is zero after point transform */
+    if (temp == 0) {
+      r++;
+      continue;
+    }
+
+    /* Emit any pending EOBRUN */
+    if (entropy->EOBRUN > 0)
+      emit_eobrun(entropy);
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+    while (r > 15) {
+      emit_ac_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+      r -= 16;
+    }
+
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 1;			/* there must be at least one 1 bit */
+    while ((temp >>= 1))
+      nbits++;
+    /* Check for out-of-range coefficient values */
+    if (nbits > MAX_COEF_BITS)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+    /* Count/emit Huffman symbol for run length / number of bits */
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
+    emit_bits_e(entropy, (unsigned int) temp2, nbits);
+
+    r = 0;			/* reset zero run length */
+  }
+
+  if (r > 0) {			/* If there are trailing zeroes, */
+    entropy->EOBRUN++;		/* count an EOB */
+    if (entropy->EOBRUN == 0x7FFF)
+      emit_eobrun(entropy);	/* force it out to avoid overflow */
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer; /* store output state */
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;	/* restart numbers cycle 0..7 */
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;			/* the only return: this routine never reports suspension */
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp;
+  int blkn;
+  int Al = cinfo->Al;
+  JBLOCKROW block;
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer; /* load output state */
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    temp = (*block)[0];
+    emit_bits_e(entropy, (unsigned int) (temp >> Al), 1); /* size 1 keeps the low bit */
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer; /* store output state */
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;	/* restart numbers cycle 0..7 */
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;			/* the only return: this routine never reports suspension */
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp;
+  register int r, k;
+  int EOB;
+  char *BR_buffer;
+  unsigned int BR;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+  int absvalues[DCTSIZE2];	/* indexed by scan position k; set for Ss..Se only */
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer; /* load output state */
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  Se = cinfo->Se;
+  Al = cinfo->Al;
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block (only MCU_data[0] is used here) */
+  block = MCU_data[0];
+
+  /* It is convenient to make a pre-pass to determine the transformed
+   * coefficients' absolute values and the EOB position.
+   */
+  EOB = 0;
+  for (k = cinfo->Ss; k <= Se; k++) {
+    temp = (*block)[natural_order[k]];
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if (temp < 0)
+      temp = -temp;		/* temp is abs value of input */
+    temp >>= Al;		/* apply the point transform */
+    absvalues[k] = temp;	/* save abs value for main pass */
+    if (temp == 1)
+      EOB = k;			/* EOB = index of last newly-nonzero coef */
+  }
+
+  /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
+  
+  r = 0;			/* r = run length of zeros */
+  BR = 0;			/* BR = count of buffered bits added now */
+  BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
+
+  for (k = cinfo->Ss; k <= Se; k++) {
+    if ((temp = absvalues[k]) == 0) {
+      r++;
+      continue;
+    }
+
+    /* Emit any required ZRLs, but not if they can be folded into EOB */
+    while (r > 15 && k <= EOB) {
+      /* emit any pending EOBRUN and the BE correction bits */
+      emit_eobrun(entropy);
+      /* Emit ZRL */
+      emit_ac_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+      r -= 16;
+      /* Emit buffered correction bits that must be associated with ZRL */
+      emit_buffered_bits(entropy, BR_buffer, BR);
+      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+      BR = 0;
+    }
+
+    /* If the coef was previously nonzero, it only needs a correction bit.
+     * NOTE: a straight translation of the spec's figure G.7 would suggest
+     * that we also need to test r > 15.  But if r > 15, we can only get here
+     * if k > EOB, which implies that this coefficient is not 1.
+     */
+    if (temp > 1) {
+      /* The correction bit is the next bit of the absolute value. */
+      BR_buffer[BR++] = (char) (temp & 1);
+      continue;
+    }
+
+    /* Emit any pending EOBRUN and the BE correction bits */
+    emit_eobrun(entropy);
+
+    /* Count/emit Huffman symbol for run length / number of bits */
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+
+    /* Emit output bit for newly-nonzero coef */
+    temp = ((*block)[natural_order[k]] < 0) ? 0 : 1; /* 0 = negative, 1 = not */
+    emit_bits_e(entropy, (unsigned int) temp, 1);
+
+    /* Emit buffered correction bits that must be associated with this code */
+    emit_buffered_bits(entropy, BR_buffer, BR);
+    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+    BR = 0;
+    r = 0;			/* reset zero run length */
+  }
+
+  if (r > 0 || BR > 0) {	/* trailing zeroes or unsent correction bits: */
+    entropy->EOBRUN++;		/* count an EOB */
+    entropy->BE += BR;		/* concat my correction bits to older ones */
+    /* We force out the EOB if we risk either:
+     * 1. overflow of the EOB counter;
+     * 2. overflow of the correction bit buffer during the next MCU.
+     */
+    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+      emit_eobrun(entropy);
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer; /* store output state */
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;	/* restart numbers cycle 0..7 */
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;			/* the only return: this routine never reports suspension */
+}
+
+
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
@@ -356,9 +919,11 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
   register int temp, temp2;
   register int nbits;
   register int k, r, i;
-  
+  int Se = state->cinfo->lim_Se;
+  const int * natural_order = state->cinfo->natural_order;
+
   /* Encode the DC coefficient difference per section F.1.2.1 */
-  
+
   temp = temp2 = block[0] - last_dc_val;
 
   if (temp < 0) {
@@ -367,7 +932,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
     /* This code assumes we are on a two's complement machine */
     temp2--;
   }
-  
+
   /* Find the number of bits needed for the magnitude of the coefficient */
   nbits = 0;
   while (temp) {
@@ -379,28 +944,28 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
    */
   if (nbits > MAX_COEF_BITS+1)
     ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-  
+
   /* Emit the Huffman-coded symbol for the number of bits */
-  if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
+  if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
     return FALSE;
 
   /* Emit that number of bits of the value, if positive, */
   /* or the complement of its magnitude, if negative. */
   if (nbits)			/* emit_bits rejects calls with size 0 */
-    if (! emit_bits(state, (unsigned int) temp2, nbits))
+    if (! emit_bits_s(state, (unsigned int) temp2, nbits))
       return FALSE;
 
   /* Encode the AC coefficients per section F.1.2.2 */
-  
+
   r = 0;			/* r = run length of zeros */
-  
-  for (k = 1; k < DCTSIZE2; k++) {
-    if ((temp = block[jpeg_natural_order[k]]) == 0) {
+
+  for (k = 1; k <= Se; k++) {
+    if ((temp = block[natural_order[k]]) == 0) {
       r++;
     } else {
       /* if run length > 15, must emit special run-length-16 codes (0xF0) */
       while (r > 15) {
-	if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
+	if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
 	  return FALSE;
 	r -= 16;
       }
@@ -411,7 +976,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
 	/* This code assumes we are on a two's complement machine */
 	temp2--;
       }
-      
+
       /* Find the number of bits needed for the magnitude of the coefficient */
       nbits = 1;		/* there must be at least one 1 bit */
       while ((temp >>= 1))
@@ -419,55 +984,30 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
       /* Check for out-of-range coefficient values */
       if (nbits > MAX_COEF_BITS)
 	ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-      
+
       /* Emit Huffman symbol for run length / number of bits */
       i = (r << 4) + nbits;
-      if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
+      if (! emit_bits_s(state, actbl->ehufco[i], actbl->ehufsi[i]))
 	return FALSE;
 
       /* Emit that number of bits of the value, if positive, */
       /* or the complement of its magnitude, if negative. */
-      if (! emit_bits(state, (unsigned int) temp2, nbits))
+      if (! emit_bits_s(state, (unsigned int) temp2, nbits))
 	return FALSE;
-      
+
       r = 0;
     }
   }
 
   /* If the last coef(s) were zero, emit an end-of-block code */
   if (r > 0)
-    if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
+    if (! emit_bits_s(state, actbl->ehufco[0], actbl->ehufsi[0]))
       return FALSE;
 
   return TRUE;
 }
 
 
-/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(boolean)
-emit_restart (working_state * state, int restart_num)
-{
-  int ci;
-
-  if (! flush_bits(state))
-    return FALSE;
-
-  emit_byte(state, 0xFF, return FALSE);
-  emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
-
-  /* Re-initialize DC predictions to 0 */
-  for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
-    state->cur.last_dc_val[ci] = 0;
-
-  /* The restart counter is not updated until we successfully write the MCU. */
-
-  return TRUE;
-}
-
-
 /*
  * Encode and output one MCU's worth of Huffman-compressed coefficients.
  */
@@ -489,7 +1029,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! emit_restart(&state, entropy->next_restart_num))
+      if (! emit_restart_s(&state, entropy->next_restart_num))
 	return FALSE;
   }
 
@@ -535,20 +1075,32 @@ finish_pass_huff (j_compress_ptr cinfo)
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   working_state state;
 
-  /* Load up working state ... flush_bits needs it */
-  state.next_output_byte = cinfo->dest->next_output_byte;
-  state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
-  state.cinfo = cinfo;
+  if (cinfo->progressive_mode) {
+    entropy->next_output_byte = cinfo->dest->next_output_byte;
+    entropy->free_in_buffer = cinfo->dest->free_in_buffer;
 
-  /* Flush out the last data */
-  if (! flush_bits(&state))
-    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+    /* Flush out any buffered data */
+    emit_eobrun(entropy);
+    flush_bits_e(entropy);
 
-  /* Update state */
-  cinfo->dest->next_output_byte = state.next_output_byte;
-  cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+    cinfo->dest->next_output_byte = entropy->next_output_byte;
+    cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+  } else {
+    /* Load up working state ... flush_bits needs it */
+    state.next_output_byte = cinfo->dest->next_output_byte;
+    state.free_in_buffer = cinfo->dest->free_in_buffer;
+    ASSIGN_STATE(state.cur, entropy->saved);
+    state.cinfo = cinfo;
+
+    /* Flush out the last data */
+    if (! flush_bits_s(&state))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+    /* Update state */
+    cinfo->dest->next_output_byte = state.next_output_byte;
+    cinfo->dest->free_in_buffer = state.free_in_buffer;
+    ASSIGN_STATE(entropy->saved, state.cur);
+  }
 }
 
 
@@ -563,8 +1115,6 @@ finish_pass_huff (j_compress_ptr cinfo)
  * the compressed data.
  */
 
-#ifdef ENTROPY_OPT_SUPPORTED
-
 
 /* Process a single block's worth of coefficients */
 
@@ -575,6 +1125,8 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   register int temp;
   register int nbits;
   register int k, r;
+  int Se = cinfo->lim_Se;
+  const int * natural_order = cinfo->natural_order;
   
   /* Encode the DC coefficient difference per section F.1.2.1 */
   
@@ -601,8 +1153,8 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   
   r = 0;			/* r = run length of zeros */
   
-  for (k = 1; k < DCTSIZE2; k++) {
-    if ((temp = block[jpeg_natural_order[k]]) == 0) {
+  for (k = 1; k <= Se; k++) {
+    if ((temp = block[natural_order[k]]) == 0) {
       r++;
     } else {
       /* if run length > 15, must emit special run-length-16 codes (0xF0) */
@@ -675,7 +1227,6 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
 /*
  * Generate the best Huffman code table for the given counts, fill htbl.
- * Note this is also used by jcphuff.c.
  *
  * The JPEG standard requires that no symbol be assigned a codeword of all
  * one bits (so that padding bits added at the end of a compressed segment
@@ -701,7 +1252,7 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  * So the extra complexity of an optimal algorithm doesn't seem worthwhile.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
 {
 #define MAX_CLEN 32		/* assumed maximum initial code length */
@@ -846,7 +1397,7 @@ METHODDEF(void)
 finish_pass_gather (j_compress_ptr cinfo)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, dctbl, actbl;
+  int ci, tbl;
   jpeg_component_info * compptr;
   JHUFF_TBL **htblptr;
   boolean did_dc[NUM_HUFF_TBLS];
@@ -855,32 +1406,147 @@ finish_pass_gather (j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
+  if (cinfo->progressive_mode)
+    /* Flush out buffered data (all we care about is counting the EOB symbol) */
+    emit_eobrun(entropy);
+
   MEMZERO(did_dc, SIZEOF(did_dc));
   MEMZERO(did_ac, SIZEOF(did_ac));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    if (! did_dc[dctbl]) {
-      htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
-      if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
-      did_dc[dctbl] = TRUE;
-    }
-    if (! did_ac[actbl]) {
-      htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
-      if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
-      did_ac[actbl] = TRUE;
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (! did_dc[tbl]) {
+	htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+	if (*htblptr == NULL)
+	  *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+	jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[tbl]);
+	did_dc[tbl] = TRUE;
+      }
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (! did_ac[tbl]) {
+	htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+	if (*htblptr == NULL)
+	  *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+	jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[tbl]);
+	did_ac[tbl] = TRUE;
+      }
     }
   }
 }
 
 
-#endif /* ENTROPY_OPT_SUPPORTED */
+/*
+ * Initialize for a Huffman-compressed scan.
+ * If gather_statistics is TRUE, we do not output anything during the scan,
+ * just count the Huffman symbols used and generate Huffman code tables.
+ */
+
+METHODDEF(void)
+start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    entropy->pub.finish_pass = finish_pass_gather;
+  else
+    entropy->pub.finish_pass = finish_pass_huff;
+
+  if (cinfo->progressive_mode) {
+    entropy->cinfo = cinfo;
+    entropy->gather_statistics = gather_statistics;
+
+    /* We assume jcmaster.c already validated the scan parameters. */
+
+    /* Select execution routine: Ah picks first/refine, Ss picks DC/AC */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else {
+	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+	/* AC refinement needs a correction bit buffer (allocated only once) */
+	if (entropy->bit_buffer == NULL)
+	  entropy->bit_buffer = (char *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					MAX_CORR_BITS * SIZEOF(char));
+      }
+    }
+
+    /* Initialize AC stuff (table number comes from the scan's first component) */
+    entropy->ac_tbl_no = cinfo->cur_comp_info[0]->ac_tbl_no;
+    entropy->EOBRUN = 0;
+    entropy->BE = 0;
+  } else {
+    if (gather_statistics)
+      entropy->pub.encode_mcu = encode_mcu_gather;
+    else
+      entropy->pub.encode_mcu = encode_mcu_huff;
+  }
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (gather_statistics) {
+	/* Check for invalid table index */
+	/* (make_c_derived_tbl does this in the other path) */
+	if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+	  ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+	/* Allocate and zero the statistics tables */
+	/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+	if (entropy->dc_count_ptrs[tbl] == NULL)
+	  entropy->dc_count_ptrs[tbl] = (long *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					257 * SIZEOF(long));
+	MEMZERO(entropy->dc_count_ptrs[tbl], 257 * SIZEOF(long));
+      } else {
+	/* Compute derived values for Huffman tables */
+	/* We may do this more than once for a table, but it's not expensive */
+	jpeg_make_c_derived_tbl(cinfo, TRUE, tbl,
+				& entropy->dc_derived_tbls[tbl]);
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (gather_statistics) {
+	/* Same index check, allocation and zeroing as for the DC counts */
+	if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+	  ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+	if (entropy->ac_count_ptrs[tbl] == NULL)
+	  entropy->ac_count_ptrs[tbl] = (long *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					257 * SIZEOF(long));
+	MEMZERO(entropy->ac_count_ptrs[tbl], 257 * SIZEOF(long));
+      } else {
+	jpeg_make_c_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->ac_derived_tbls[tbl]);
+      }
+    }
+  }
+
+  /* Initialize bit buffer to empty */
+  entropy->saved.put_buffer = 0;
+  entropy->saved.put_bits = 0;
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
 
 
 /*
@@ -902,8 +1568,9 @@ jinit_huff_encoder (j_compress_ptr cinfo)
   /* Mark tables unallocated */
   for (i = 0; i < NUM_HUFF_TBLS; i++) {
     entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
-#ifdef ENTROPY_OPT_SUPPORTED
     entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;
-#endif
   }
+
+  if (cinfo->progressive_mode)
+    entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
 }
diff --git a/jpeg/jchuff.h b/jpeg/jchuff.h
deleted file mode 100644
index a9599fc1e..000000000
--- a/jpeg/jchuff.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * jchuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy encoding routines
- * that are shared between the sequential encoder (jchuff.c) and the
- * progressive encoder (jcphuff.c).  No other modules need to see these.
- */
-
-/* The legal range of a DCT coefficient is
- *  -1024 .. +1023  for 8-bit data;
- * -16384 .. +16383 for 12-bit data.
- * Hence the magnitude should always fit in 10 or 14 bits respectively.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
-#else
-#define MAX_COEF_BITS 14
-#endif
-
-/* Derived data constructed for each Huffman table */
-
-typedef struct {
-  unsigned int ehufco[256];	/* code for each symbol */
-  char ehufsi[256];		/* length of code for each symbol */
-  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
-} c_derived_tbl;
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_c_derived_tbl	jMkCDerived
-#define jpeg_gen_optimal_table	jGenOptTbl
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
-	JPP((j_compress_ptr cinfo, boolean isDC, int tblno,
-	     c_derived_tbl ** pdtbl));
-
-/* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
-	JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
diff --git a/jpeg/jcinit.c b/jpeg/jcinit.c
index 5efffe331..0ba310f21 100644
--- a/jpeg/jcinit.c
+++ b/jpeg/jcinit.c
@@ -41,17 +41,10 @@ jinit_compress_master (j_compress_ptr cinfo)
   /* Forward DCT */
   jinit_forward_dct(cinfo);
   /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_encoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
+    jinit_huff_encoder(cinfo);
   }
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
diff --git a/jpeg/jcmainct.c b/jpeg/jcmainct.c
index a0d82683f..7de75d167 100644
--- a/jpeg/jcmainct.c
+++ b/jpeg/jcmainct.c
@@ -68,32 +68,32 @@ METHODDEF(void) process_data_buffer_main
 METHODDEF(void)
 start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
   /* Do nothing in raw-data mode. */
   if (cinfo->raw_data_in)
     return;
 
-  jmain->cur_iMCU_row = 0;	/* initialize counters */
-  jmain->rowgroup_ctr = 0;
-  jmain->suspended = FALSE;
-  jmain->pass_mode = pass_mode;	/* save mode for use by process_data */
+  main->cur_iMCU_row = 0;	/* initialize counters */
+  main->rowgroup_ctr = 0;
+  main->suspended = FALSE;
+  main->pass_mode = pass_mode;	/* save mode for use by process_data */
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    if (jmain->whole_image[0] != NULL)
+    if (main->whole_image[0] != NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
-    jmain->pub.process_data = process_data_simple_main;
+    main->pub.process_data = process_data_simple_main;
     break;
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
   case JBUF_SAVE_SOURCE:
   case JBUF_CRANK_DEST:
   case JBUF_SAVE_AND_PASS:
-    if (jmain->whole_image[0] == NULL)
+    if (main->whole_image[0] == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    jmain->pub.process_data = process_data_buffer_main;
+    main->pub.process_data = process_data_buffer_main;
     break;
 #endif
   default:
@@ -114,46 +114,46 @@ process_data_simple_main (j_compress_ptr cinfo,
 			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
 			  JDIMENSION in_rows_avail)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
-  while (jmain->cur_iMCU_row < cinfo->total_iMCU_rows) {
+  while (main->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Read input data if we haven't filled the main buffer yet */
-    if (jmain->rowgroup_ctr < DCTSIZE)
+    if (main->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size)
       (*cinfo->prep->pre_process_data) (cinfo,
 					input_buf, in_row_ctr, in_rows_avail,
-					jmain->buffer, &jmain->rowgroup_ctr,
-					(JDIMENSION) DCTSIZE);
+					main->buffer, &main->rowgroup_ctr,
+					(JDIMENSION) cinfo->min_DCT_v_scaled_size);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
      * at the bottom of the image.
      */
-    if (jmain->rowgroup_ctr != DCTSIZE)
+    if (main->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size)
       return;
 
     /* Send the completed row to the compressor */
-    if (! (*cinfo->coef->compress_data) (cinfo, jmain->buffer)) {
+    if (! (*cinfo->coef->compress_data) (cinfo, main->buffer)) {
       /* If compressor did not consume the whole row, then we must need to
        * suspend processing and return to the application.  In this situation
        * we pretend we didn't yet consume the last input row; otherwise, if
        * it happened to be the last row of the image, the application would
        * think we were done.
        */
-      if (! jmain->suspended) {
+      if (! main->suspended) {
 	(*in_row_ctr)--;
-	jmain->suspended = TRUE;
+	main->suspended = TRUE;
       }
       return;
     }
     /* We did finish the row.  Undo our little suspension hack if a previous
      * call suspended; then mark the main buffer empty.
      */
-    if (jmain->suspended) {
+    if (main->suspended) {
       (*in_row_ctr)++;
-      jmain->suspended = FALSE;
+      main->suspended = FALSE;
     }
-    jmain->rowgroup_ctr = 0;
-    jmain->cur_iMCU_row++;
+    main->rowgroup_ctr = 0;
+    main->cur_iMCU_row++;
   }
 }
 
@@ -170,25 +170,25 @@ process_data_buffer_main (j_compress_ptr cinfo,
 			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
 			  JDIMENSION in_rows_avail)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci;
   jpeg_component_info *compptr;
-  boolean writing = (jmain->pass_mode != JBUF_CRANK_DEST);
+  boolean writing = (main->pass_mode != JBUF_CRANK_DEST);
 
-  while (jmain->cur_iMCU_row < cinfo->total_iMCU_rows) {
+  while (main->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Realign the virtual buffers if at the start of an iMCU row. */
-    if (jmain->rowgroup_ctr == 0) {
+    if (main->rowgroup_ctr == 0) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	   ci++, compptr++) {
-	jmain->buffer[ci] = (*cinfo->mem->access_virt_sarray)
-	  ((j_common_ptr) cinfo, jmain->whole_image[ci],
-	   jmain->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
+	main->buffer[ci] = (*cinfo->mem->access_virt_sarray)
+	  ((j_common_ptr) cinfo, main->whole_image[ci],
+	   main->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
 	   (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
       }
       /* In a read pass, pretend we just read some source data. */
       if (! writing) {
 	*in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
-	jmain->rowgroup_ctr = DCTSIZE;
+	main->rowgroup_ctr = DCTSIZE;
       }
     }
 
@@ -197,40 +197,40 @@ process_data_buffer_main (j_compress_ptr cinfo,
     if (writing) {
       (*cinfo->prep->pre_process_data) (cinfo,
 					input_buf, in_row_ctr, in_rows_avail,
-					jmain->buffer, &jmain->rowgroup_ctr,
+					main->buffer, &main->rowgroup_ctr,
 					(JDIMENSION) DCTSIZE);
       /* Return to application if we need more data to fill the iMCU row. */
-      if (jmain->rowgroup_ctr < DCTSIZE)
+      if (main->rowgroup_ctr < DCTSIZE)
 	return;
     }
 
     /* Emit data, unless this is a sink-only pass. */
-    if (jmain->pass_mode != JBUF_SAVE_SOURCE) {
-      if (! (*cinfo->coef->compress_data) (cinfo, jmain->buffer)) {
+    if (main->pass_mode != JBUF_SAVE_SOURCE) {
+      if (! (*cinfo->coef->compress_data) (cinfo, main->buffer)) {
 	/* If compressor did not consume the whole row, then we must need to
 	 * suspend processing and return to the application.  In this situation
 	 * we pretend we didn't yet consume the last input row; otherwise, if
 	 * it happened to be the last row of the image, the application would
 	 * think we were done.
 	 */
-	if (! jmain->suspended) {
+	if (! main->suspended) {
 	  (*in_row_ctr)--;
-	  jmain->suspended = TRUE;
+	  main->suspended = TRUE;
 	}
 	return;
       }
       /* We did finish the row.  Undo our little suspension hack if a previous
        * call suspended; then mark the main buffer empty.
        */
-      if (jmain->suspended) {
+      if (main->suspended) {
 	(*in_row_ctr)++;
-	jmain->suspended = FALSE;
+	main->suspended = FALSE;
       }
     }
 
     /* If get here, we are done with this iMCU row.  Mark buffer empty. */
-    jmain->rowgroup_ctr = 0;
-    jmain->cur_iMCU_row++;
+    main->rowgroup_ctr = 0;
+    main->cur_iMCU_row++;
   }
 }
 
@@ -244,15 +244,15 @@ process_data_buffer_main (j_compress_ptr cinfo,
 GLOBAL(void)
 jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr jmain;
+  my_main_ptr main;
   int ci;
   jpeg_component_info *compptr;
 
-  jmain = (my_main_ptr)
+  main = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_c_main_controller *) jmain;
-  jmain->pub.start_pass = start_pass_main;
+  cinfo->main = (struct jpeg_c_main_controller *) main;
+  main->pub.start_pass = start_pass_main;
 
   /* We don't need to create a buffer in raw-data mode. */
   if (cinfo->raw_data_in)
@@ -267,27 +267,27 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     /* Note we pad the bottom to a multiple of the iMCU height */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
-      jmain->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
+      main->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-	 compptr->width_in_blocks * DCTSIZE,
+	 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
 	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
 				(long) compptr->v_samp_factor) * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+	 (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
     }
 #else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
   } else {
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    jmain->whole_image[0] = NULL; /* flag for no virtual arrays */
+    main->whole_image[0] = NULL; /* flag for no virtual arrays */
 #endif
     /* Allocate a strip buffer for each component */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
-      jmain->buffer[ci] = (*cinfo->mem->alloc_sarray)
+      main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 compptr->width_in_blocks * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+	 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+	 (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
     }
   }
 }
diff --git a/jpeg/jcmarker.c b/jpeg/jcmarker.c
index 3d1e6c6d5..2e2898342 100644
--- a/jpeg/jcmarker.c
+++ b/jpeg/jcmarker.c
@@ -2,6 +2,7 @@
  * jcmarker.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -153,21 +154,22 @@ emit_dqt (j_compress_ptr cinfo, int index)
     ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
 
   prec = 0;
-  for (i = 0; i < DCTSIZE2; i++) {
-    if (qtbl->quantval[i] > 255)
+  for (i = 0; i <= cinfo->lim_Se; i++) {
+    if (qtbl->quantval[cinfo->natural_order[i]] > 255)
       prec = 1;
   }
 
   if (! qtbl->sent_table) {
     emit_marker(cinfo, M_DQT);
 
-    emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_2bytes(cinfo,
+      prec ? cinfo->lim_Se * 2 + 2 + 1 + 2 : cinfo->lim_Se + 1 + 1 + 2);
 
     emit_byte(cinfo, index + (prec<<4));
 
-    for (i = 0; i < DCTSIZE2; i++) {
+    for (i = 0; i <= cinfo->lim_Se; i++) {
       /* The table entries must be emitted in zigzag order. */
-      unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
+      unsigned int qval = qtbl->quantval[cinfo->natural_order[i]];
       if (prec)
 	emit_byte(cinfo, (int) (qval >> 8));
       emit_byte(cinfo, (int) (qval & 0xFF));
@@ -235,8 +237,12 @@ emit_dac (j_compress_ptr cinfo)
   
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
-    dc_in_use[compptr->dc_tbl_no] = 1;
-    ac_in_use[compptr->ac_tbl_no] = 1;
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0)
+      dc_in_use[compptr->dc_tbl_no] = 1;
+    /* AC needs no table when not present */
+    if (cinfo->Se)
+      ac_in_use[compptr->ac_tbl_no] = 1;
   }
   
   length = 0;
@@ -285,13 +291,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->image_height > 65535L ||
-      (long) cinfo->image_width > 65535L)
+  if ((long) cinfo->jpeg_height > 65535L ||
+      (long) cinfo->jpeg_width > 65535L)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
 
   emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int) cinfo->image_height);
-  emit_2bytes(cinfo, (int) cinfo->image_width);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_height);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_width);
 
   emit_byte(cinfo, cinfo->num_components);
 
@@ -320,22 +326,16 @@ emit_sos (j_compress_ptr cinfo)
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
     emit_byte(cinfo, compptr->component_id);
-    td = compptr->dc_tbl_no;
-    ta = compptr->ac_tbl_no;
-    if (cinfo->progressive_mode) {
-      /* Progressive mode: only DC or only AC tables are used in one scan;
-       * furthermore, Huffman coding of DC refinement uses no table at all.
-       * We emit 0 for unused field(s); this is recommended by the P&M text
-       * but does not seem to be specified in the standard.
-       */
-      if (cinfo->Ss == 0) {
-	ta = 0;			/* DC scan */
-	if (cinfo->Ah != 0 && !cinfo->arith_code)
-	  td = 0;		/* no DC table either */
-      } else {
-	td = 0;			/* AC scan */
-      }
-    }
+
+    /* We emit 0 for unused field(s); this is recommended by the P&M text
+     * but does not seem to be specified in the standard.
+     */
+
+    /* DC needs no table for refinement scan */
+    td = cinfo->Ss == 0 && cinfo->Ah == 0 ? compptr->dc_tbl_no : 0;
+    /* AC needs no table when not present */
+    ta = cinfo->Se ? compptr->ac_tbl_no : 0;
+
     emit_byte(cinfo, (td << 4) + ta);
   }
 
@@ -345,6 +345,22 @@ emit_sos (j_compress_ptr cinfo)
 }
 
 
+LOCAL(void)
+emit_pseudo_sos (j_compress_ptr cinfo)
+/* Emit a pseudo SOS marker: a scan header listing no components, whose Se byte is block_size^2 - 1 */
+{
+  emit_marker(cinfo, M_SOS);
+  
+  emit_2bytes(cinfo, 2 + 1 + 3); /* length: 2 for this field + 1 (Ns) + 3 (Ss, Se, Ah/Al) */
+  
+  emit_byte(cinfo, 0); /* Ns: zero components follow */
+
+  emit_byte(cinfo, 0); /* Ss */
+  emit_byte(cinfo, cinfo->block_size * cinfo->block_size - 1); /* Se: index of last coefficient in a block_size x block_size block */
+  emit_byte(cinfo, 0); /* Ah/Al: both written as zero */
+}
+
+
 LOCAL(void)
 emit_jfif_app0 (j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
@@ -484,7 +500,7 @@ write_file_header (j_compress_ptr cinfo)
 
 /*
  * Write frame header.
- * This consists of DQT and SOFn markers.
+ * This consists of DQT and SOFn markers, and a conditional pseudo SOS marker.
  * Note that we do not emit the SOF until we have emitted the DQT(s).
  * This avoids compatibility problems with incorrect implementations that
  * try to error-check the quant table numbers as soon as they see the SOF.
@@ -511,7 +527,7 @@ write_frame_header (j_compress_ptr cinfo)
    * Note we assume that Huffman table numbers won't be changed later.
    */
   if (cinfo->arith_code || cinfo->progressive_mode ||
-      cinfo->data_precision != 8) {
+      cinfo->data_precision != 8 || cinfo->block_size != DCTSIZE) {
     is_baseline = FALSE;
   } else {
     is_baseline = TRUE;
@@ -529,7 +545,10 @@ write_frame_header (j_compress_ptr cinfo)
 
   /* Emit the proper SOF marker */
   if (cinfo->arith_code) {
-    emit_sof(cinfo, M_SOF9);	/* SOF code for arithmetic coding */
+    if (cinfo->progressive_mode)
+      emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */
+    else
+      emit_sof(cinfo, M_SOF9);  /* SOF code for sequential arithmetic */
   } else {
     if (cinfo->progressive_mode)
       emit_sof(cinfo, M_SOF2);	/* SOF code for progressive Huffman */
@@ -538,6 +557,10 @@ write_frame_header (j_compress_ptr cinfo)
     else
       emit_sof(cinfo, M_SOF1);	/* SOF code for non-baseline Huffman file */
   }
+
+  /* Check to emit pseudo SOS marker */
+  if (cinfo->progressive_mode && cinfo->block_size != DCTSIZE)
+    emit_pseudo_sos(cinfo);
 }
 
 
@@ -566,19 +589,12 @@ write_scan_header (j_compress_ptr cinfo)
      */
     for (i = 0; i < cinfo->comps_in_scan; i++) {
       compptr = cinfo->cur_comp_info[i];
-      if (cinfo->progressive_mode) {
-	/* Progressive mode: only DC or only AC tables are used in one scan */
-	if (cinfo->Ss == 0) {
-	  if (cinfo->Ah == 0)	/* DC needs no table for refinement scan */
-	    emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
-	} else {
-	  emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
-	}
-      } else {
-	/* Sequential mode: need both DC and AC tables */
+      /* DC needs no table for refinement scan */
+      if (cinfo->Ss == 0 && cinfo->Ah == 0)
 	emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
+      /* AC needs no table when not present */
+      if (cinfo->Se)
 	emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
-      }
     }
   }
 
diff --git a/jpeg/jcmaster.c b/jpeg/jcmaster.c
index aab4020b8..660883f45 100644
--- a/jpeg/jcmaster.c
+++ b/jpeg/jcmaster.c
@@ -2,6 +2,7 @@
  * jcmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -42,23 +43,200 @@ typedef my_comp_master * my_master_ptr;
  * Support routines that do various essential calculations.
  */
 
+/*
+ * Compute jpeg_width/jpeg_height and min_DCT_h/v_scaled_size from
+ * image_width/image_height and the scale_num/scale_denom ratio.
+ * NOTE: exported for possible use by the application, so it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+/* Do computations that are needed before master selection phase */
+{
+#ifdef DCT_SCALING_SUPPORTED
+  /* NOTE(review): the shifts and scale_* products below are not range-checked in this function - confirm callers bound the inputs. */
+  /* Compute actual JPEG image dimensions and DCT scaling choices: the first ratio 8/N (N = 1..15) that scale_num/scale_denom reaches wins and N becomes the min DCT scaled size; anything smaller gets 1/2 (N = 16). */
+  if (cinfo->scale_num >= cinfo->scale_denom * 8) {
+    /* Provide 8/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width << 3;
+    cinfo->jpeg_height = cinfo->image_height << 3;
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num >= cinfo->scale_denom * 4) {
+    /* Provide 4/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width << 2;
+    cinfo->jpeg_height = cinfo->image_height << 2;
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 8) {
+    /* Provide 8/3 scaling */
+    cinfo->jpeg_width = (cinfo->image_width << 1) + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2, 3L);
+    cinfo->jpeg_height = (cinfo->image_height << 1) + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2, 3L);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num >= cinfo->scale_denom * 2) {
+    /* Provide 2/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width << 1;
+    cinfo->jpeg_height = cinfo->image_height << 1;
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 8) {
+    /* Provide 8/5 scaling */
+    cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 3, 5L);
+    cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 3, 5L);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 4) {
+    /* Provide 4/3 scaling */
+    cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, 3L);
+    cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, 3L);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 8) {
+    /* Provide 8/7 scaling */
+    cinfo->jpeg_width = cinfo->image_width + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, 7L);
+    cinfo->jpeg_height = cinfo->image_height + (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, 7L);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num >= cinfo->scale_denom) {
+    /* Provide 1/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width;
+    cinfo->jpeg_height = cinfo->image_height;
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * 8) {
+    /* Provide 8/9 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8, 9L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8, 9L);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 4) {
+    /* Provide 4/5 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4, 5L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4, 5L);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * 8) {
+    /* Provide 8/11 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8, 11L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8, 11L);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 2) {
+    /* Provide 2/3 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2, 3L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2, 3L);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * 8) {
+    /* Provide 8/13 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8, 13L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8, 13L);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 4) {
+    /* Provide 4/7 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4, 7L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4, 7L);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * 8) {
+    /* Provide 8/15 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8, 15L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8, 15L);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide 1/2 scaling (fallback for every ratio below 8/15) */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, 2L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, 2L);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+#else /* !DCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling": output size equals input size, DCTSIZE blocks */
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+
+#endif /* DCT_SCALING_SUPPORTED */
+}
+
+/* Transcode-only setup: validate the preset min DCT sizes (1..16, h == v), then derive block_size, natural_order and lim_Se. */
+LOCAL(void)
+jpeg_calc_trans_dimensions (j_compress_ptr cinfo)
+{
+  if (cinfo->min_DCT_h_scaled_size < 1 || cinfo->min_DCT_h_scaled_size > 16
+      || cinfo->min_DCT_h_scaled_size != cinfo->min_DCT_v_scaled_size)
+    ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	     cinfo->min_DCT_h_scaled_size, cinfo->min_DCT_v_scaled_size);
+  /* h and v sizes are equal past this point, so either one gives the block size */
+  cinfo->block_size = cinfo->min_DCT_h_scaled_size;
+  /* Sizes 2..7 select a dedicated jpeg_natural_orderN table; every other size uses jpeg_natural_order */
+  switch (cinfo->block_size) {
+  case 2: cinfo->natural_order = jpeg_natural_order2; break;
+  case 3: cinfo->natural_order = jpeg_natural_order3; break;
+  case 4: cinfo->natural_order = jpeg_natural_order4; break;
+  case 5: cinfo->natural_order = jpeg_natural_order5; break;
+  case 6: cinfo->natural_order = jpeg_natural_order6; break;
+  case 7: cinfo->natural_order = jpeg_natural_order7; break;
+  default: cinfo->natural_order = jpeg_natural_order; break;
+  }
+  /* Highest usable coefficient index: block_size^2 - 1 below DCTSIZE, otherwise DCTSIZE2-1 */
+  cinfo->lim_Se = cinfo->block_size < DCTSIZE ?
+    cinfo->block_size * cinfo->block_size - 1 : DCTSIZE2-1;
+}
+
+
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo)
+initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
-  int ci;
+  int ci, ssize;
   jpeg_component_info *compptr;
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
 
+  if (transcode_only)
+    jpeg_calc_trans_dimensions(cinfo);
+  else
+    jpeg_calc_jpeg_dimensions(cinfo);
+
   /* Sanity check on image dimensions */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
-      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+  if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0 ||
+      cinfo->num_components <= 0 || cinfo->input_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+  if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION ||
+      (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
 
   /* Width of an input scanline must be representable as JDIMENSION. */
@@ -95,22 +273,52 @@ initial_setup (j_compress_ptr cinfo)
        ci++, compptr++) {
     /* Fill in the correct component_index value; don't rely on application */
     compptr->component_index = ci;
-    /* For compression, we never do DCT scaling. */
-    compptr->DCT_scaled_size = DCTSIZE;
+    /* In selecting the actual DCT scaling for each component, we try to
+     * scale down the chroma components via DCT scaling rather than downsampling.
+     * This saves time if the downsampler gets to use 1:1 scaling.
+     * Note this code adapts subsampling ratios which are powers of 2.
+     */
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+	   (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+	   (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support DCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+	compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+	compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
+
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor,
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor,
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_width *
+		    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_height *
+		    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -119,8 +327,8 @@ initial_setup (j_compress_ptr cinfo)
    * main controller will call coefficient controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long) cinfo->jpeg_height,
+		  (long) (cinfo->max_v_samp_factor * cinfo->block_size));
 }
 
 
@@ -260,6 +468,39 @@ validate_script (j_compress_ptr cinfo)
   }
 }
 
+
+LOCAL(void)
+reduce_script (j_compress_ptr cinfo)
+/* Adapt scan script in place for use with reduced block size: drop scans with Ss > lim_Se,
+ * clamp Se to lim_Se, update num_scans; assume that script has been validated before.
+ */
+{
+  jpeg_scan_info * scanptr;
+  int idxout, idxin;
+
+  /* Circumvent const declaration for this function: the script is rewritten through this pointer */
+  scanptr = (jpeg_scan_info *) cinfo->scan_info;
+  idxout = 0;
+
+  for (idxin = 0; idxin < cinfo->num_scans; idxin++) {
+    /* idxin reads every entry, idxout is the next slot to keep; after skipping, idxout becomes smaller than idxin */
+    if (idxin != idxout)
+      /* Move the current entry down into the free slot;
+       * note we stay in given chunk of allocated memory.
+       */
+      scanptr[idxout] = scanptr[idxin];
+    if (scanptr[idxout].Ss > cinfo->lim_Se)
+      /* Entire scan out of range - skip this entry (idxout is not advanced, so the slot is reused) */
+      continue;
+    if (scanptr[idxout].Se > cinfo->lim_Se)
+      /* Limit scan to end of block */
+      scanptr[idxout].Se = cinfo->lim_Se;
+    idxout++;
+  }
+
+  cinfo->num_scans = idxout; /* number of entries kept */
+}
+
 #endif /* C_MULTISCAN_FILES_SUPPORTED */
 
 
@@ -280,10 +521,13 @@ select_scan_parameters (j_compress_ptr cinfo)
       cinfo->cur_comp_info[ci] =
 	&cinfo->comp_info[scanptr->component_index[ci]];
     }
-    cinfo->Ss = scanptr->Ss;
-    cinfo->Se = scanptr->Se;
-    cinfo->Ah = scanptr->Ah;
-    cinfo->Al = scanptr->Al;
+    if (cinfo->progressive_mode) {
+      cinfo->Ss = scanptr->Ss;
+      cinfo->Se = scanptr->Se;
+      cinfo->Ah = scanptr->Ah;
+      cinfo->Al = scanptr->Al;
+      return;
+    }
   }
   else
 #endif
@@ -296,11 +540,11 @@ select_scan_parameters (j_compress_ptr cinfo)
     for (ci = 0; ci < cinfo->num_components; ci++) {
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
     }
-    cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2-1;
-    cinfo->Ah = 0;
-    cinfo->Al = 0;
   }
+  cinfo->Ss = 0;
+  cinfo->Se = cinfo->block_size * cinfo->block_size - 1;
+  cinfo->Ah = 0;
+  cinfo->Al = 0;
 }
 
 
@@ -325,7 +569,7 @@ per_scan_setup (j_compress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = DCTSIZE;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -347,11 +591,11 @@ per_scan_setup (j_compress_ptr cinfo)
     
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width,
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height,
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     
     cinfo->blocks_in_MCU = 0;
     
@@ -361,7 +605,7 @@ per_scan_setup (j_compress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
@@ -433,7 +677,7 @@ prepare_for_pass (j_compress_ptr cinfo)
     /* Do Huffman optimization for a scan after the first one. */
     select_scan_parameters(cinfo);
     per_scan_setup(cinfo);
-    if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+    if (cinfo->Ss != 0 || cinfo->Ah == 0) {
       (*cinfo->entropy->start_pass) (cinfo, TRUE);
       (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
       master->pub.call_pass_startup = FALSE;
@@ -554,11 +798,13 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
   master->pub.is_last_pass = FALSE;
 
   /* Validate parameters, determine derived values */
-  initial_setup(cinfo);
+  initial_setup(cinfo, transcode_only);
 
   if (cinfo->scan_info != NULL) {
 #ifdef C_MULTISCAN_FILES_SUPPORTED
     validate_script(cinfo);
+    if (cinfo->block_size < DCTSIZE)
+      reduce_script(cinfo);
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
@@ -567,8 +813,10 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode)	/*  TEMPORARY HACK ??? */
-    cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
+  if ((cinfo->progressive_mode || cinfo->block_size < DCTSIZE) &&
+      !cinfo->arith_code)			/*  TEMPORARY HACK ??? */
+    /* assume default tables no good for progressive or downscale mode */
+    cinfo->optimize_coding = TRUE;
 
   /* Initialize my private state */
   if (transcode_only) {
diff --git a/jpeg/jconfig.doc b/jpeg/jconfig.doc
deleted file mode 100644
index c18d1c064..000000000
--- a/jpeg/jconfig.doc
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * jconfig.doc
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file documents the configuration options that are required to
- * customize the JPEG software for a particular system.
- *
- * The actual configuration options for a particular installation are stored
- * in jconfig.h.  On many machines, jconfig.h can be generated automatically
- * or copied from one of the "canned" jconfig files that we supply.  But if
- * you need to generate a jconfig.h file by hand, this file tells you how.
- *
- * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
- * EDIT A COPY NAMED JCONFIG.H.
- */
-
-
-/*
- * These symbols indicate the properties of your machine or compiler.
- * #define the symbol if yes, #undef it if no.
- */
-
-/* Does your compiler support function prototypes?
- * (If not, you also need to use ansi2knr, see install.doc)
- */
-#define HAVE_PROTOTYPES
-
-/* Does your compiler support the declaration "unsigned char" ?
- * How about "unsigned short" ?
- */
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-
-/* Define "void" as "char" if your compiler doesn't know about type void.
- * NOTE: be sure to define void such that "void *" represents the most general
- * pointer type, e.g., that returned by malloc().
- */
-/* #define void char */
-
-/* Define "const" as empty if your compiler doesn't know the "const" keyword.
- */
-/* #define const */
-
-/* Define this if an ordinary "char" type is unsigned.
- * If you're not sure, leaving it undefined will work at some cost in speed.
- * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
- */
-#undef CHAR_IS_UNSIGNED
-
-/* Define this if your system has an ANSI-conforming <stddef.h> file.
- */
-#define HAVE_STDDEF_H
-
-/* Define this if your system has an ANSI-conforming <stdlib.h> file.
- */
-#define HAVE_STDLIB_H
-
-/* Define this if your system does not have an ANSI/SysV <string.h>,
- * but does have a BSD-style <strings.h>.
- */
-#undef NEED_BSD_STRINGS
-
-/* Define this if your system does not provide typedef size_t in any of the
- * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
- * <sys/types.h> instead.
- */
-#undef NEED_SYS_TYPES_H
-
-/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
- * unless you are using a large-data memory model or 80386 flat-memory mode.
- * On less brain-damaged CPUs this symbol must not be defined.
- * (Defining this symbol causes large data structures to be referenced through
- * "far" pointers and to be allocated with a special version of malloc.)
- */
-#undef NEED_FAR_POINTERS
-
-/* Define this if your linker needs global names to be unique in less
- * than the first 15 characters.
- */
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-/* Although a real ANSI C compiler can deal perfectly well with pointers to
- * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
- * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
- * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
- * actually get "missing structure definition" warnings or errors while
- * compiling the JPEG code.
- */
-#undef INCOMPLETE_TYPES_BROKEN
-
-
-/*
- * The following options affect code selection within the JPEG library,
- * but they don't need to be visible to applications using the library.
- * To minimize application namespace pollution, the symbols won't be
- * defined unless JPEG_INTERNALS has been defined.
- */
-
-#ifdef JPEG_INTERNALS
-
-/* Define this if your compiler implements ">>" on signed values as a logical
- * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
- * which is the normal and rational definition.
- */
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-
-#endif /* JPEG_INTERNALS */
-
-
-/*
- * The remaining options do not affect the JPEG library proper,
- * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
- * Other applications can ignore these.
- */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-/* These defines indicate which image (non-JPEG) file formats are allowed. */
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-/* Define this if you want to name both input and output files on the command
- * line, rather than using stdout and optionally stdin.  You MUST do this if
- * your system can't cope with binary I/O to stdin/stdout.  See comments at
- * head of cjpeg.c or djpeg.c.
- */
-#undef TWO_FILE_COMMANDLINE
-
-/* Define this if your system needs explicit cleanup of temporary files.
- * This is crucial under MS-DOS, where the temporary "files" may be areas
- * of extended memory; on most other systems it's not as important.
- */
-#undef NEED_SIGNAL_CATCHER
-
-/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
- * This is necessary on systems that distinguish text files from binary files,
- * and is harmless on most systems that don't.  If you have one of the rare
- * systems that complains about the "b" spec, define this symbol.
- */
-#undef DONT_USE_B_MODE
-
-/* Define this if you want percent-done progress reports from cjpeg/djpeg.
- */
-#undef PROGRESS_REPORT
-
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/jconfig.txt b/jpeg/jconfig.txt
new file mode 100644
index 000000000..b96d31249
--- /dev/null
+++ b/jpeg/jconfig.txt
@@ -0,0 +1,164 @@
+/*
+ * jconfig.txt
+ *
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file documents the configuration options that are required to
+ * customize the JPEG software for a particular system.
+ *
+ * The actual configuration options for a particular installation are stored
+ * in jconfig.h.  On many machines, jconfig.h can be generated automatically
+ * or copied from one of the "canned" jconfig files that we supply.  But if
+ * you need to generate a jconfig.h file by hand, this file tells you how.
+ *
+ * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
+ * EDIT A COPY NAMED JCONFIG.H.
+ */
+
+
+/*
+ * These symbols indicate the properties of your machine or compiler.
+ * #define the symbol if yes, #undef it if no.
+ */
+
+/* Does your compiler support function prototypes?
+ * (If not, you also need to use ansi2knr, see install.txt)
+ */
+#define HAVE_PROTOTYPES
+
+/* Does your compiler support the declaration "unsigned char" ?
+ * How about "unsigned short" ?
+ */
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+
+/* Define "void" as "char" if your compiler doesn't know about type void.
+ * NOTE: be sure to define void such that "void *" represents the most general
+ * pointer type, e.g., that returned by malloc().
+ */
+/* #define void char */
+
+/* Define "const" as empty if your compiler doesn't know the "const" keyword.
+ */
+/* #define const */
+
+/* Define this if an ordinary "char" type is unsigned.
+ * If you're not sure, leaving it undefined will work at some cost in speed.
+ * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
+ */
+#undef CHAR_IS_UNSIGNED
+
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
+#define HAVE_STDDEF_H
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
+#define HAVE_STDLIB_H
+
+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
+#undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
+#undef NEED_SYS_TYPES_H
+
+/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
+ * unless you are using a large-data memory model or 80386 flat-memory mode.
+ * On less brain-damaged CPUs this symbol must not be defined.
+ * (Defining this symbol causes large data structures to be referenced through
+ * "far" pointers and to be allocated with a special version of malloc.)
+ */
+#undef NEED_FAR_POINTERS
+
+/* Define this if your linker needs global names to be unique in less
+ * than the first 15 characters.
+ */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, on Windows systems.
+ */
+#ifdef _WIN32
+#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
+#endif
+
+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+
+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+
+#endif /* JPEG_INTERNALS */
+
+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
+#ifdef JPEG_CJPEG_DJPEG
+
+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
+#undef TWO_FILE_COMMANDLINE
+
+/* Define this if your system needs explicit cleanup of temporary files.
+ * This is crucial under MS-DOS, where the temporary "files" may be areas
+ * of extended memory; on most other systems it's not as important.
+ */
+#undef NEED_SIGNAL_CATCHER
+
+/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
+#undef DONT_USE_B_MODE
+
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
+#undef PROGRESS_REPORT
+
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/jcparam.c b/jpeg/jcparam.c
index 6fc48f536..c5e85dda5 100644
--- a/jpeg/jcparam.c
+++ b/jpeg/jcparam.c
@@ -2,6 +2,7 @@
  * jcparam.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -60,6 +61,47 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }
 
 
+/* These are the sample quantization tables given in JPEG spec section K.1.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+  17,  18,  24,  47,  99,  99,  99,  99,
+  18,  21,  26,  66,  99,  99,  99,  99,
+  24,  26,  56,  99,  99,  99,  99,  99,
+  47,  66,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99
+};
+
+
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Install the default quantization tables: slot 0 (luminance) scaled by
+ * cinfo->q_scale_factor[0], slot 1 (chrominance) by q_scale_factor[1].
+ * force_baseline is passed unchanged to jpeg_add_quant_table.
+ */
+{
+  /* Each table gets its own scale factor, so luma and chroma can differ */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+		       cinfo->q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+		       cinfo->q_scale_factor[1], force_baseline);
+}
+
+
 GLOBAL(void)
 jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 			 boolean force_baseline)
@@ -69,31 +111,6 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  * applications that insist on a linear percentage scaling.
  */
 {
-  /* These are the sample quantization tables given in JPEG spec section K.1.
-   * The spec says that the values given produce "good" quality, and
-   * when divided by 2, "very good" quality.
-   */
-  static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-    16,  11,  10,  16,  24,  40,  51,  61,
-    12,  12,  14,  19,  26,  58,  60,  55,
-    14,  13,  16,  24,  40,  57,  69,  56,
-    14,  17,  22,  29,  51,  87,  80,  62,
-    18,  22,  37,  56,  68, 109, 103,  77,
-    24,  35,  55,  64,  81, 104, 113,  92,
-    49,  64,  78,  87, 103, 121, 120, 101,
-    72,  92,  95,  98, 112, 100, 103,  99
-  };
-  static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99
-  };
-
   /* Set up two quantization tables using the specified scaling */
   jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
 		       scale_factor, force_baseline);
@@ -284,6 +301,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 
   /* Initialize everything not dependent on the color space */
 
+  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_denom = 1;
   cinfo->data_precision = BITS_IN_JSAMPLE;
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
@@ -320,6 +339,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   /* By default, use the simpler non-cosited sampling alignment */
   cinfo->CCIR601_sampling = FALSE;
 
+  /* By default, apply fancy downsampling */
+  cinfo->do_fancy_downsampling = TRUE;
+
   /* No input smoothing */
   cinfo->smoothing_factor = 0;
 
diff --git a/jpeg/jcphuff.c b/jpeg/jcphuff.c
deleted file mode 100644
index 07f9178b0..000000000
--- a/jpeg/jcphuff.c
+++ /dev/null
@@ -1,833 +0,0 @@
-/*
- * jcphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy encoding routines for progressive JPEG.
- *
- * We do not support output suspension in this module, since the library
- * currently does not allow multiple-scan files to be written with output
- * suspension.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jchuff.c */
-
-#ifdef C_PROGRESSIVE_SUPPORTED
-
-/* Expanded entropy encoder object for progressive Huffman encoding. */
-
-typedef struct {
-  struct jpeg_entropy_encoder pub; /* public fields */
-
-  /* Mode flag: TRUE for optimization, FALSE for actual data output */
-  boolean gather_statistics;
-
-  /* Bit-level coding status.
-   * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
-   */
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
-  INT32 put_buffer;		/* current bit-accumulation buffer */
-  int put_bits;			/* # of bits now in it */
-  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
-
-  /* Coding status for DC components */
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-
-  /* Coding status for AC components */
-  int ac_tbl_no;		/* the table number of the single component */
-  unsigned int EOBRUN;		/* run length of EOBs */
-  unsigned int BE;		/* # of buffered correction bits before MCU */
-  char * bit_buffer;		/* buffer for correction bits (1 per char) */
-  /* packing correction bits tightly would save some space but cost time... */
-
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
-
-  /* Pointers to derived tables (these workspaces have image lifespan).
-   * Since any one scan codes only DC or only AC, we only need one set
-   * of tables, not one for DC and one for AC.
-   */
-  c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  /* Statistics tables for optimization; again, one set is enough */
-  long * count_ptrs[NUM_HUFF_TBLS];
-} phuff_entropy_encoder;
-
-typedef phuff_entropy_encoder * phuff_entropy_ptr;
-
-/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
- * buffer can hold.  Larger sizes may slightly improve compression, but
- * 1000 is already well into the realm of overkill.
- * The minimum safe size is 64 bits.
- */
-
-#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
-
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
- * which should be safe.
- */
-
-#ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-	((ishift_temp = (x)) < 0 ? \
-	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-	 (ishift_temp >> (shft)))
-#else
-#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
-#endif
-
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_DC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_phuff JPP((j_compress_ptr cinfo));
-METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo));
-
-
-/*
- * Initialize for a Huffman-compressed scan using progressive JPEG.
- */
-
-METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
-{  
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-
-  entropy->cinfo = cinfo;
-  entropy->gather_statistics = gather_statistics;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* We assume jcmaster.c already validated the scan parameters. */
-
-  /* Select execution routines */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_first;
-    else
-      entropy->pub.encode_mcu = encode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_refine;
-    else {
-      entropy->pub.encode_mcu = encode_mcu_AC_refine;
-      /* AC refinement needs a correction bit buffer */
-      if (entropy->bit_buffer == NULL)
-	entropy->bit_buffer = (char *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      MAX_CORR_BITS * SIZEOF(char));
-    }
-  }
-  if (gather_statistics)
-    entropy->pub.finish_pass = finish_pass_gather_phuff;
-  else
-    entropy->pub.finish_pass = finish_pass_phuff;
-
-  /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1
-   * for AC coefficients.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Initialize DC predictions to 0 */
-    entropy->last_dc_val[ci] = 0;
-    /* Get table index */
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      entropy->ac_tbl_no = tbl = compptr->ac_tbl_no;
-    }
-    if (gather_statistics) {
-      /* Check for invalid table index */
-      /* (make_c_derived_tbl does this in the other path) */
-      if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
-        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
-      /* Allocate and zero the statistics tables */
-      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
-      if (entropy->count_ptrs[tbl] == NULL)
-	entropy->count_ptrs[tbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
-    } else {
-      /* Compute derived values for Huffman table */
-      /* We may do this more than once for a table, but it's not expensive */
-      jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-			      & entropy->derived_tbls[tbl]);
-    }
-  }
-
-  /* Initialize AC stuff */
-  entropy->EOBRUN = 0;
-  entropy->BE = 0;
-
-  /* Initialize bit buffer to empty */
-  entropy->put_buffer = 0;
-  entropy->put_bits = 0;
-
-  /* Initialize restart stuff */
-  entropy->restarts_to_go = cinfo->restart_interval;
-  entropy->next_restart_num = 0;
-}
-
-
-/* Outputting bytes to the file.
- * NB: these must be called only when actually outputting,
- * that is, entropy->gather_statistics == FALSE.
- */
-
-/* Emit a byte */
-#define emit_byte(entropy,val)  \
-	{ *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-	  if (--(entropy)->free_in_buffer == 0)  \
-	    dump_buffer(entropy); }
-
-
-LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
-/* Empty the output buffer; we do not support suspension in this module. */
-{
-  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
-
-  if (! (*dest->empty_output_buffer) (entropy->cinfo))
-    ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
-  /* After a successful buffer dump, must reset buffer pointers */
-  entropy->next_output_byte = dest->next_output_byte;
-  entropy->free_in_buffer = dest->free_in_buffer;
-}
-
-
-/* Outputting bits to the file */
-
-/* Only the right 24 bits of put_buffer are used; the valid bits are
- * left-justified in this part.  At most 16 bits can be passed to emit_bits
- * in one call, and we never retain more than 7 bits in put_buffer
- * between calls, so 24 bits are sufficient.
- */
-
-INLINE
-LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
-/* Emit some bits, unless we are in gather mode */
-{
-  /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
-  register int put_bits = entropy->put_bits;
-
-  /* if size is 0, caller used an invalid Huffman table entry */
-  if (size == 0)
-    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-  if (entropy->gather_statistics)
-    return;			/* do nothing if we're only getting stats */
-
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-  
-  put_bits += size;		/* new number of bits in buffer */
-  
-  put_buffer <<= 24 - put_bits; /* align incoming bits */
-
-  put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
-
-  while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
-    
-    emit_byte(entropy, c);
-    if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(entropy, 0);
-    }
-    put_buffer <<= 8;
-    put_bits -= 8;
-  }
-
-  entropy->put_buffer = put_buffer; /* update variables */
-  entropy->put_bits = put_bits;
-}
-
-
-LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
-{
-  emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
-  entropy->put_buffer = 0;     /* and reset bit-buffer to empty */
-  entropy->put_bits = 0;
-}
-
-
-/*
- * Emit (or just count) a Huffman symbol.
- */
-
-INLINE
-LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
-{
-  if (entropy->gather_statistics)
-    entropy->count_ptrs[tbl_no][symbol]++;
-  else {
-    c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
-    emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
-  }
-}
-
-
-/*
- * Emit bits from a correction bit buffer.
- */
-
-LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart,
-		    unsigned int nbits)
-{
-  if (entropy->gather_statistics)
-    return;			/* no real work */
-
-  while (nbits > 0) {
-    emit_bits(entropy, (unsigned int) (*bufstart), 1);
-    bufstart++;
-    nbits--;
-  }
-}
-
-
-/*
- * Emit any pending EOBRUN symbol.
- */
-
-LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
-{
-  register int temp, nbits;
-
-  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
-    temp = entropy->EOBRUN;
-    nbits = 0;
-    while ((temp >>= 1))
-      nbits++;
-    /* safety check: shouldn't happen given limited correction-bit buffer */
-    if (nbits > 14)
-      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-    emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
-    if (nbits)
-      emit_bits(entropy, entropy->EOBRUN, nbits);
-
-    entropy->EOBRUN = 0;
-
-    /* Emit any buffered correction bits */
-    emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
-{
-  int ci;
-
-  emit_eobrun(entropy);
-
-  if (! entropy->gather_statistics) {
-    flush_bits(entropy);
-    emit_byte(entropy, 0xFF);
-    emit_byte(entropy, JPEG_RST0 + restart_num);
-  }
-
-  if (entropy->cinfo->Ss == 0) {
-    /* Re-initialize DC predictions to 0 */
-    for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
-      entropy->last_dc_val[ci] = 0;
-  } else {
-    /* Re-initialize all AC-related fields to 0 */
-    entropy->EOBRUN = 0;
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * MCU encoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  int blkn, ci;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  jpeg_component_info * compptr;
-  ISHIFT_TEMPS
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-
-    /* Compute the DC value after the required point transform by Al.
-     * This is simply an arithmetic right shift.
-     */
-    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
-
-    /* DC differences are figured on the point-transformed values. */
-    temp = temp2 - entropy->last_dc_val[ci];
-    entropy->last_dc_val[ci] = temp2;
-
-    /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
-      /* This code assumes we are on a two's complement machine */
-      temp2--;
-    }
-    
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
-    /* Check for out-of-range coefficient values.
-     * Since we're encoding a difference, the range limit is twice as much.
-     */
-    if (nbits > MAX_COEF_BITS+1)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-    
-    /* Count/emit the Huffman-coded symbol for the number of bits */
-    emit_symbol(entropy, compptr->dc_tbl_no, nbits);
-    
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    if (nbits)			/* emit_bits rejects calls with size 0 */
-      emit_bits(entropy, (unsigned int) temp2, nbits);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  register int r, k;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
-  
-  r = 0;			/* r = run length of zeros */
-   
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
-      r++;
-      continue;
-    }
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value; so the code is
-     * interwoven with finding the abs value (temp) and output bits (temp2).
-     */
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      temp >>= Al;		/* apply the point transform */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-      temp2 = ~temp;
-    } else {
-      temp >>= Al;		/* apply the point transform */
-      temp2 = temp;
-    }
-    /* Watch out for case that nonzero coef is zero after point transform */
-    if (temp == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any pending EOBRUN */
-    if (entropy->EOBRUN > 0)
-      emit_eobrun(entropy);
-    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-    while (r > 15) {
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-    }
-
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;			/* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
-    /* Check for out-of-range coefficient values */
-    if (nbits > MAX_COEF_BITS)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
-
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    emit_bits(entropy, (unsigned int) temp2, nbits);
-
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0) {			/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    if (entropy->EOBRUN == 0x7FFF)
-      emit_eobrun(entropy);	/* force it out to avoid overflow */
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  int blkn;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* We simply emit the Al'th bit of the DC coefficient value. */
-    temp = (*block)[0];
-    emit_bits(entropy, (unsigned int) (temp >> Al), 1);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  register int r, k;
-  int EOB;
-  char *BR_buffer;
-  unsigned int BR;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  int absvalues[DCTSIZE2];
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* It is convenient to make a pre-pass to determine the transformed
-   * coefficients' absolute values and the EOB position.
-   */
-  EOB = 0;
-  for (k = cinfo->Ss; k <= Se; k++) {
-    temp = (*block)[jpeg_natural_order[k]];
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value.
-     */
-    if (temp < 0)
-      temp = -temp;		/* temp is abs value of input */
-    temp >>= Al;		/* apply the point transform */
-    absvalues[k] = temp;	/* save abs value for main pass */
-    if (temp == 1)
-      EOB = k;			/* EOB = index of last newly-nonzero coef */
-  }
-
-  /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
-  
-  r = 0;			/* r = run length of zeros */
-  BR = 0;			/* BR = count of buffered bits added now */
-  BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
-
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = absvalues[k]) == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any required ZRLs, but not if they can be folded into EOB */
-    while (r > 15 && k <= EOB) {
-      /* emit any pending EOBRUN and the BE correction bits */
-      emit_eobrun(entropy);
-      /* Emit ZRL */
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-      /* Emit buffered correction bits that must be associated with ZRL */
-      emit_buffered_bits(entropy, BR_buffer, BR);
-      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-      BR = 0;
-    }
-
-    /* If the coef was previously nonzero, it only needs a correction bit.
-     * NOTE: a straight translation of the spec's figure G.7 would suggest
-     * that we also need to test r > 15.  But if r > 15, we can only get here
-     * if k > EOB, which implies that this coefficient is not 1.
-     */
-    if (temp > 1) {
-      /* The correction bit is the next bit of the absolute value. */
-      BR_buffer[BR++] = (char) (temp & 1);
-      continue;
-    }
-
-    /* Emit any pending EOBRUN and the BE correction bits */
-    emit_eobrun(entropy);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
-
-    /* Emit output bit for newly-nonzero coef */
-    temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
-    emit_bits(entropy, (unsigned int) temp, 1);
-
-    /* Emit buffered correction bits that must be associated with this code */
-    emit_buffered_bits(entropy, BR_buffer, BR);
-    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-    BR = 0;
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0 || BR > 0) {	/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    entropy->BE += BR;		/* concat my correction bits to older ones */
-    /* We force out the EOB if we risk either:
-     * 1. overflow of the EOB counter;
-     * 2. overflow of the correction bit buffer during the next MCU.
-     */
-    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
-      emit_eobrun(entropy);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * Finish up at the end of a Huffman-compressed progressive scan.
- */
-
-METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Flush out any buffered data */
-  emit_eobrun(entropy);
-  flush_bits(entropy);
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-}
-
-
-/*
- * Finish up a statistics-gathering pass and create the new Huffman tables.
- */
-
-METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-  JHUFF_TBL **htblptr;
-  boolean did[NUM_HUFF_TBLS];
-
-  /* Flush out buffered data (all we care about is counting the EOB symbol) */
-  emit_eobrun(entropy);
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* It's important not to apply jpeg_gen_optimal_table more than once
-   * per table, because it clobbers the input frequency counts!
-   */
-  MEMZERO(did, SIZEOF(did));
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      tbl = compptr->ac_tbl_no;
-    }
-    if (! did[tbl]) {
-      if (is_DC_band)
-        htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
-      else
-        htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
-      if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
-      did[tbl] = TRUE;
-    }
-  }
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy encoding.
- */
-
-GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff;
-
-  /* Mark tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-    entropy->count_ptrs[i] = NULL;
-  }
-  entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
-}
-
-#endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/jpeg/jcprepct.c b/jpeg/jcprepct.c
index fa93333db..be44cc4b4 100644
--- a/jpeg/jcprepct.c
+++ b/jpeg/jcprepct.c
@@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo,
 	*out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	   ci++, compptr++) {
+	numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		  cinfo->min_DCT_v_scaled_size;
 	expand_bottom_edge(output_buf[ci],
-			   compptr->width_in_blocks * DCTSIZE,
-			   (int) (*out_row_group_ctr * compptr->v_samp_factor),
-			   (int) (out_row_groups_avail * compptr->v_samp_factor));
+			   compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+			   (int) (*out_row_group_ctr * numrows),
+			   (int) (out_row_groups_avail * numrows));
       }
       *out_row_group_ctr = out_row_groups_avail;
       break;			/* can exit outer loop without test */
@@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo)
      */
     true_buffer = (*cinfo->mem->alloc_sarray)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+       (JDIMENSION) (((long) compptr->width_in_blocks *
+		      cinfo->min_DCT_h_scaled_size *
 		      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
        (JDIMENSION) (3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
@@ -346,7 +349,8 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 	 ci++, compptr++) {
       prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+	 (JDIMENSION) (((long) compptr->width_in_blocks *
+			cinfo->min_DCT_h_scaled_size *
 			cinfo->max_h_samp_factor) / compptr->h_samp_factor),
 	 (JDIMENSION) cinfo->max_v_samp_factor);
     }
diff --git a/jpeg/jcsample.c b/jpeg/jcsample.c
index 212ec8757..4d36f85f3 100644
--- a/jpeg/jcsample.c
+++ b/jpeg/jcsample.c
@@ -62,6 +62,15 @@ typedef struct {
 
   /* Downsampling method pointers, one per component */
   downsample1_ptr methods[MAX_COMPONENTS];
+
+  /* Height of an output row group for each component. */
+  int rowgroup_height[MAX_COMPONENTS];
+
+  /* These arrays save pixel expansion factors so that int_downsample need not
+   * recompute them each time.  They are unused for other downsampling methods.
+   */
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
 } my_downsampler;
 
 typedef my_downsampler * my_downsample_ptr;
@@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo,
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     in_ptr = input_buf[ci] + in_row_index;
-    out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor);
+    out_ptr = output_buf[ci] +
+	      (out_row_group_index * downsample->rowgroup_height[ci]);
     (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr);
   }
 }
@@ -140,14 +150,15 @@ METHODDEF(void)
 int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;	/* outcol_h == outcol*h_expand */
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   JSAMPROW inptr, outptr;
   INT32 outvalue;
 
-  h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
-  v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
+  h_expand = downsample->h_expand[compptr->component_index];
+  v_expand = downsample->v_expand[compptr->component_index];
   numpix = h_expand * v_expand;
   numpix2 = numpix/2;
 
@@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * h_expand);
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     for (outcol = 0, outcol_h = 0; outcol < output_cols;
 	 outcol++, outcol_h += h_expand) {
@@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
       *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
     }
     inrow += v_expand;
+    outrow++;
   }
 }
 
@@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   jcopy_sample_rows(input_data, 0, output_data, 0,
 		    cinfo->max_v_samp_factor, cinfo->image_width);
   /* Edge-expand */
-  expand_right_edge(output_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+  expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+		    compptr->width_in_blocks * compptr->DCT_h_scaled_size);
 }
 
 
@@ -212,9 +224,9 @@ METHODDEF(void)
 h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr, outptr;
   register int bias;
 
@@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * 2);
 
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
     bias = 0;			/* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
@@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   int inrow, outrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr0, inptr1, outptr;
   register int bias;
 
@@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * 2);
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow+1];
@@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
       inptr0 += 2; inptr1 += 2;
     }
     inrow += 2;
+    outrow++;
   }
 }
 
@@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   int inrow, outrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
   INT32 membersum, neighsum, memberscale, neighscale;
 
@@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */
   neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow+1];
@@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
     *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
 
     inrow += 2;
+    outrow++;
   }
 }
 
@@ -392,9 +406,9 @@ METHODDEF(void)
 fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
 			    JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr, above_ptr, below_ptr, outptr;
   INT32 membersum, neighsum, memberscale, neighscale;
   int colsum, lastcolsum, nextcolsum;
@@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
   memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */
   neighscale = cinfo->smoothing_factor * 64; /* scaled SF */
 
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
-    above_ptr = input_data[outrow-1];
-    below_ptr = input_data[outrow+1];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
+    above_ptr = input_data[inrow-1];
+    below_ptr = input_data[inrow+1];
 
     /* Special case for first column */
     colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
@@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo)
   int ci;
   jpeg_component_info * compptr;
   boolean smoothok = TRUE;
+  int h_in_group, v_in_group, h_out_group, v_out_group;
 
   downsample = (my_downsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo)
   /* Verify we can handle the sampling factors, and set up method pointers */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
-	compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    /* Compute size of an "output group" for DCT scaling.  This many samples
+     * are to be converted from max_h_samp_factor * max_v_samp_factor pixels.
+     */
+    h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+		  cinfo->min_DCT_h_scaled_size;
+    v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		  cinfo->min_DCT_v_scaled_size;
+    h_in_group = cinfo->max_h_samp_factor;
+    v_in_group = cinfo->max_v_samp_factor;
+    downsample->rowgroup_height[ci] = v_out_group; /* save for use later */
+    if (h_in_group == h_out_group && v_in_group == v_out_group) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
 	downsample->methods[ci] = fullsize_smooth_downsample;
@@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo)
       } else
 #endif
 	downsample->methods[ci] = fullsize_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+	       v_in_group == v_out_group) {
       smoothok = FALSE;
       downsample->methods[ci] = h2v1_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+	       v_in_group == v_out_group * 2) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
 	downsample->methods[ci] = h2v2_smooth_downsample;
@@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo)
       } else
 #endif
 	downsample->methods[ci] = h2v2_downsample;
-    } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
-	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
+    } else if ((h_in_group % h_out_group) == 0 &&
+	       (v_in_group % v_out_group) == 0) {
       smoothok = FALSE;
       downsample->methods[ci] = int_downsample;
+      downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group);
+      downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
   }
diff --git a/jpeg/jctrans.c b/jpeg/jctrans.c
index 0e6d70769..cee6b0f34 100644
--- a/jpeg/jctrans.c
+++ b/jpeg/jctrans.c
@@ -2,6 +2,7 @@
  * jctrans.c
  *
  * Copyright (C) 1995-1998, Thomas G. Lane.
+ * Modified 2000-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -76,6 +77,10 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
   dstinfo->image_height = srcinfo->image_height;
   dstinfo->input_components = srcinfo->num_components;
   dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->jpeg_width = srcinfo->output_width;
+  dstinfo->jpeg_height = srcinfo->output_height;
+  dstinfo->min_DCT_h_scaled_size = srcinfo->min_DCT_h_scaled_size;
+  dstinfo->min_DCT_v_scaled_size = srcinfo->min_DCT_v_scaled_size;
   /* Initialize all parameters to default values */
   jpeg_set_defaults(dstinfo);
   /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB.
@@ -158,25 +163,14 @@ LOCAL(void)
 transencode_master_selection (j_compress_ptr cinfo,
 			      jvirt_barray_ptr * coef_arrays)
 {
-  /* Although we don't actually use input_components for transcoding,
-   * jcmaster.c's initial_setup will complain if input_components is 0.
-   */
-  cinfo->input_components = 1;
   /* Initialize master control (includes parameter checking/processing) */
   jinit_c_master_control(cinfo, TRUE /* transcode only */);
 
   /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_encoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
+    jinit_huff_encoder(cinfo);
   }
 
   /* We need a special coefficient buffer controller. */
diff --git a/jpeg/jdapimin.c b/jpeg/jdapimin.c
index cadb59fce..7f1ce4c05 100644
--- a/jpeg/jdapimin.c
+++ b/jpeg/jdapimin.c
@@ -2,6 +2,7 @@
  * jdapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -185,8 +186,8 @@ default_decompress_parms (j_decompress_ptr cinfo)
   }
 
   /* Set defaults for other decompression parameters. */
-  cinfo->scale_num = 1;		/* 1:1 scaling */
-  cinfo->scale_denom = 1;
+  cinfo->scale_num = cinfo->block_size;		/* 1:1 scaling */
+  cinfo->scale_denom = cinfo->block_size;
   cinfo->output_gamma = 1.0;
   cinfo->buffered_image = FALSE;
   cinfo->raw_data_out = FALSE;
diff --git a/jpeg/jdapistd.c b/jpeg/jdapistd.c
index c8e3fa0c3..9d7453777 100644
--- a/jpeg/jdapistd.c
+++ b/jpeg/jdapistd.c
@@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
   }
 
   /* Verify that at least one iMCU row can be returned. */
-  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size;
   if (max_lines < lines_per_iMCU_row)
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
diff --git a/jpeg/jdarith.c b/jpeg/jdarith.c
new file mode 100644
index 000000000..c858b248b
--- /dev/null
+++ b/jpeg/jdarith.c
@@ -0,0 +1,772 @@
+/*
+ * jdarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  INT32 c;       /* C register, base of coding interval + input bit buffer */
+  INT32 a;               /* A register, normalized size of coding interval */
+  int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
+                                                         /* init: ct = -16 */
+                                                         /* run: ct = 0..7 */
+                                                         /* error: ct = -1 */
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte (j_decompress_ptr cinfo)
+/* Read next input byte; we do not support suspension in this module. */
+{
+  struct jpeg_source_mgr * src = cinfo->src;
+
+  if (src->bytes_in_buffer == 0)
+    if (! (*src->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  src->bytes_in_buffer--;
+  return GETJOCTET(*src->next_input_byte++);
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can do without the renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv, data;
+
+  /* Renormalization & data input per section D.2.6 */
+  while (e->a < 0x8000L) {
+    if (--e->ct < 0) {
+      /* Need to fetch next data byte */
+      if (cinfo->unread_marker)
+	data = 0;		/* stuff zero data */
+      else {
+	data = get_byte(cinfo);	/* read next input byte */
+	if (data == 0xFF) {	/* zero stuff or marker code */
+	  do data = get_byte(cinfo);
+	  while (data == 0xFF);	/* swallow extra 0xFF bytes */
+	  if (data == 0)
+	    data = 0xFF;	/* discard stuffed zero byte */
+	  else {
+	    /* Note: Different from the Huffman decoder, hitting
+	     * a marker while processing the compressed data
+	     * segment is legal in arithmetic coding.
+	     * The convention is to supply zero data
+	     * then until decoding is complete.
+	     */
+	    cinfo->unread_marker = data;
+	    data = 0;
+	  }
+	}
+      }
+      e->c = (e->c << 8) | data; /* insert data into C register */
+      if ((e->ct += 8) < 0)	 /* update bit shift counter */
+	/* Need more initial bytes */
+	if (++e->ct == 0)
+	  /* Got 2 initial bytes -> re-init A and exit loop */
+	  e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+    }
+    e->a <<= 1;
+  }
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+  temp = e->a - qe;
+  e->a = temp;
+  temp <<= e->ct;
+  if (e->c >= temp) {
+    e->c -= temp;
+    /* Conditional LPS (less probable symbol) exchange */
+    if (e->a < qe) {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    } else {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    }
+  } else if (e->a < 0x8000L) {
+    /* Conditional MPS (more probable symbol) exchange */
+    if (e->a < qe) {
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    } else {
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    }
+  }
+
+  return sv >> 7;
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  /* Advance past the RSTn marker */
+  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+	(cinfo->progressive_mode && cinfo->Ss)) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Reset restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    /* Scale (unsigned: no UB on negatives) & output DC; jpeg_natural_order[0]=0 */
+    (*block)[0] = (JCOEF) ((unsigned int) entropy->last_dc_val[ci] << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+  /* Figure F.20: Decode_AC_coefficients */
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    while (arith_decode(cinfo, st + 1) == 0) {
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+    /* Figure F.21: Decoding nonzero value v */
+    /* Figure F.22: Decoding the sign of v */
+    sign = arith_decode(cinfo, entropy->fixed_bin);
+    st += 2;
+    /* Figure F.23: Decoding the magnitude category of v */
+    if ((m = arith_decode(cinfo, st)) != 0) {
+      if (arith_decode(cinfo, st)) {
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+    }
+    v = m;
+    /* Figure F.24: Decoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      if (arith_decode(cinfo, st)) v |= m;
+    v += 1; if (sign) v = -v;
+    /* Scale (unsigned shift: v may be negative) & output in natural order */
+    (*block)[natural_order[k]] = (JCOEF) ((unsigned int) v << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int p1, blkn;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    if (arith_decode(cinfo, st))
+      MCU_data[blkn][0][0] |= p1;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  unsigned char *st;
+  int tbl, k, kex;
+  int p1, m1;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  m1 = -p1;	/* -1 in that bit position; -p1 avoids shifting a negative value */
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = cinfo->Se; kex > 0; kex--)
+    if ((*block)[natural_order[kex]]) break;
+
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+    for (;;) {
+      thiscoef = *block + natural_order[k];
+      if (*thiscoef) {				/* previously nonzero coef */
+	if (arith_decode(cinfo, st + 2)) {
+	  if (*thiscoef < 0)
+	    *thiscoef += m1;
+	  else
+	    *thiscoef += p1;
+	}
+	break;
+      }
+      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
+	if (arith_decode(cinfo, entropy->fixed_bin))
+	  *thiscoef = m1;
+	else
+	  *thiscoef = p1;
+	break;
+      }
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+
+    /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Figure F.20: Decode_AC_coefficients */
+    for (k = 1; k <= cinfo->lim_Se; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      while (arith_decode(cinfo, st + 1) == 0) {
+	st += 3; k++;
+	if (k > cinfo->lim_Se) {
+	  WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	  entropy->ct = -1;			/* spectral overflow */
+	  return TRUE;
+	}
+      }
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, entropy->fixed_bin);
+      st += 2;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	if (arith_decode(cinfo, st)) {
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (arith_decode(cinfo, st)) {
+	    if ((m <<= 1) == 0x8000) {
+	      WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	      entropy->ct = -1;			/* magnitude overflow */
+	      return TRUE;
+	    }
+	    st += 1;
+	  }
+	}
+      }
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      (*block)[natural_order[k]] = (JCOEF) v;
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > cinfo->lim_Se)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	(cinfo->Se < DCTSIZE2 && cinfo->Se != cinfo->lim_Se))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+    /* Select MCU decoding routine */
+    entropy->pub.decode_mcu = decode_mcu;
+  }
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+	(cinfo->progressive_mode && cinfo->Ss)) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+    }
+  }
+
+  /* Initialize arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++) 
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;
+  }
+}
diff --git a/jpeg/jdatadst.c b/jpeg/jdatadst.c
index a8f6fb0e0..472d5f324 100644
--- a/jpeg/jdatadst.c
+++ b/jpeg/jdatadst.c
@@ -2,13 +2,14 @@
  * jdatadst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains compression data destination routines for the case of
- * emitting JPEG data to a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * destination manager.
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
  * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
  * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -19,6 +20,11 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
 
 /* Expanded data destination object for stdio output */
 
@@ -34,6 +40,21 @@ typedef my_destination_mgr * my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
 
 
+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char ** outbuffer;	/* target buffer */
+  unsigned long * outsize;
+  unsigned char * newbuffer;	/* newly allocated buffer */
+  JOCTET * buffer;		/* start of buffer */
+  size_t bufsize;
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr * my_mem_dest_ptr;
+
+
 /*
  * Initialize destination --- called by jpeg_start_compress
  * before any data is actually written.
@@ -53,6 +74,12 @@ init_destination (j_compress_ptr cinfo)
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }
 
+METHODDEF(void)
+init_mem_destination (j_compress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
 
 /*
  * Empty the output buffer --- called whenever buffer fills up.
@@ -92,6 +119,36 @@ empty_output_buffer (j_compress_ptr cinfo)
   return TRUE;
 }
 
+METHODDEF(boolean)
+empty_mem_output_buffer (j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET * nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  /* Try to allocate new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+
+  if (dest->newbuffer != NULL)
+    free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+
 
 /*
  * Terminate destination --- called by jpeg_finish_compress
@@ -119,6 +176,15 @@ term_destination (j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 }
 
+METHODDEF(void)
+term_mem_destination (j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+
 
 /*
  * Prepare for output to a stdio stream.
@@ -149,3 +215,59 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
   dest->pub.term_destination = term_destination;
   dest->outfile = outfile;
 }
+
+
+/*
+ * Prepare for output to a memory buffer.
+ * The caller may supply its own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
+ */
+
+GLOBAL(void)
+jpeg_mem_dest (j_compress_ptr cinfo,
+	       unsigned char ** outbuffer, unsigned long * outsize)
+{
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* The destination object is made permanent so that multiple JPEG images
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
+   */
+  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(my_mem_destination_mgr));
+  } else if (cinfo->dest->init_destination != init_mem_destination) {
+    /* A manager installed by another routine (e.g. jpeg_stdio_dest) is not
+     * known to be as large as my_mem_destination_mgr; reusing it could
+     * write past the end of its allocation, so refuse instead.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  }
+
+  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->newbuffer = NULL;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    /* Allocate initial buffer */
+    dest->newbuffer = *outbuffer = malloc(OUTPUT_BUF_SIZE);
+    if (dest->newbuffer == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  dest->pub.free_in_buffer = dest->bufsize = *outsize;
+}
diff --git a/jpeg/jdatasrc.c b/jpeg/jdatasrc.c
index edc752bf5..c8fe3daf3 100644
--- a/jpeg/jdatasrc.c
+++ b/jpeg/jdatasrc.c
@@ -2,13 +2,14 @@
  * jdatasrc.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains decompression data source routines for the case of
- * reading JPEG data from a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * source manager.
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
  * IMPORTANT: we assume that fread() will correctly transcribe an array of
  * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -52,6 +53,12 @@ init_source (j_decompress_ptr cinfo)
   src->start_of_file = TRUE;
 }
 
+METHODDEF(void)
+init_mem_source (j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
 
 /*
  * Fill the input buffer --- called whenever buffer is emptied.
@@ -111,6 +118,26 @@ fill_input_buffer (j_decompress_ptr cinfo)
   return TRUE;
 }
 
+METHODDEF(boolean)
+fill_mem_input_buffer (j_decompress_ptr cinfo)
+{
+  static JOCTET mybuffer[4];
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so any request for more data beyond the given buffer size
+   * is treated as an error.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+  /* Insert a fake EOI marker */
+  mybuffer[0] = (JOCTET) 0xFF;
+  mybuffer[1] = (JOCTET) JPEG_EOI;
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+
 
 /*
  * Skip data --- used to skip over a potentially large amount of
@@ -127,22 +154,22 @@ fill_input_buffer (j_decompress_ptr cinfo)
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  struct jpeg_source_mgr * src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->pub.bytes_in_buffer) {
-      num_bytes -= (long) src->pub.bytes_in_buffer;
-      (void) fill_input_buffer(cinfo);
+    while (num_bytes > (long) src->bytes_in_buffer) {
+      num_bytes -= (long) src->bytes_in_buffer;
+      (void) (*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->pub.next_input_byte += (size_t) num_bytes;
-    src->pub.bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t) num_bytes;
+    src->bytes_in_buffer -= (size_t) num_bytes;
   }
 }
 
@@ -210,3 +237,38 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
   src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */
   src->pub.next_input_byte = NULL; /* until buffer loaded */
 }
+
+
+/*
+ * Prepare for input from a supplied memory buffer.
+ * The buffer must contain the whole JPEG data.
+ */
+
+GLOBAL(void)
+jpeg_mem_src (j_decompress_ptr cinfo,
+	      unsigned char * inbuffer, unsigned long insize)
+{
+  struct jpeg_source_mgr * src;
+
+  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(struct jpeg_source_mgr));
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = (size_t) insize;
+  src->next_input_byte = (JOCTET *) inbuffer;
+}
diff --git a/jpeg/jdcoefct.c b/jpeg/jdcoefct.c
index 4938d20fc..462e92c61 100644
--- a/jpeg/jdcoefct.c
+++ b/jpeg/jdcoefct.c
@@ -187,7 +187,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						    : compptr->last_col_width;
 	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
+	  yoffset * compptr->DCT_v_scaled_size;
 	start_col = MCU_col_num * compptr->MCU_sample_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (cinfo->input_iMCU_row < last_iMCU_row ||
@@ -197,11 +197,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      (*inverse_DCT) (cinfo, compptr,
 			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
 			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
+	      output_col += compptr->DCT_h_scaled_size;
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
+	  output_ptr += compptr->DCT_v_scaled_size;
 	}
       }
     }
@@ -362,9 +362,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
 			output_ptr, output_col);
 	buffer_ptr++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->DCT_h_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
     }
   }
 
@@ -654,9 +654,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	DC4 = DC5; DC5 = DC6;
 	DC7 = DC8; DC8 = DC9;
 	buffer_ptr++, prev_block_row++, next_block_row++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->DCT_h_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
     }
   }
 
diff --git a/jpeg/jdct.h b/jpeg/jdct.h
index 04192a266..360dec80c 100644
--- a/jpeg/jdct.h
+++ b/jpeg/jdct.h
@@ -14,11 +14,16 @@
 
 
 /*
- * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
- * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples.  (NOTE: Floating-point DCT
- * implementations use an array of type FAST_FLOAT, instead.)
- * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * A forward DCT routine is given a pointer to an input sample array and
+ * a pointer to a work area of type DCTELEM[]; the DCT is to be performed
+ * in-place in that buffer.  Type DCTELEM is int for 8-bit samples, INT32
+ * for 12-bit samples.  (NOTE: Floating-point DCT implementations use an
+ * array of type FAST_FLOAT, instead.)
+ * The input data is to be fetched from the sample array starting at a
+ * specified column.  (Any row offset needed will be applied to the array
+ * pointer before it is passed to the FDCT code.)
+ * Note that the number of samples fetched by the FDCT routine is
+ * DCT_h_scaled_size * DCT_v_scaled_size.
  * The DCT outputs are returned scaled up by a factor of 8; they therefore
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
@@ -32,8 +37,12 @@ typedef int DCTELEM;		/* 16 or 32 bits is fine */
 typedef INT32 DCTELEM;		/* must have 32 bits */
 #endif
 
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data,
+					       JSAMPARRAY sample_data,
+					       JDIMENSION start_col));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data,
+					     JSAMPARRAY sample_data,
+					     JDIMENSION start_col));
 
 
 /*
@@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
  * sample array starting at a specified column.  (Any row offset needed will
  * be applied to the array pointer before it is passed to the IDCT code.)
  * Note that the number of samples emitted by the IDCT routine is
- * DCT_scaled_size * DCT_scaled_size.
+ * DCT_h_scaled_size * DCT_v_scaled_size.
  */
 
 /* typedef inverse_DCT_method_ptr is declared in jpegint.h */
@@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #define jpeg_fdct_islow		jFDislow
 #define jpeg_fdct_ifast		jFDifast
 #define jpeg_fdct_float		jFDfloat
+#define jpeg_fdct_7x7		jFD7x7
+#define jpeg_fdct_6x6		jFD6x6
+#define jpeg_fdct_5x5		jFD5x5
+#define jpeg_fdct_4x4		jFD4x4
+#define jpeg_fdct_3x3		jFD3x3
+#define jpeg_fdct_2x2		jFD2x2
+#define jpeg_fdct_1x1		jFD1x1
+#define jpeg_fdct_9x9		jFD9x9
+#define jpeg_fdct_10x10		jFD10x10
+#define jpeg_fdct_11x11		jFD11x11
+#define jpeg_fdct_12x12		jFD12x12
+#define jpeg_fdct_13x13		jFD13x13
+#define jpeg_fdct_14x14		jFD14x14
+#define jpeg_fdct_15x15		jFD15x15
+#define jpeg_fdct_16x16		jFD16x16
+#define jpeg_fdct_16x8		jFD16x8
+#define jpeg_fdct_14x7		jFD14x7
+#define jpeg_fdct_12x6		jFD12x6
+#define jpeg_fdct_10x5		jFD10x5
+#define jpeg_fdct_8x4		jFD8x4
+#define jpeg_fdct_6x3		jFD6x3
+#define jpeg_fdct_4x2		jFD4x2
+#define jpeg_fdct_2x1		jFD2x1
+#define jpeg_fdct_8x16		jFD8x16
+#define jpeg_fdct_7x14		jFD7x14
+#define jpeg_fdct_6x12		jFD6x12
+#define jpeg_fdct_5x10		jFD5x10
+#define jpeg_fdct_4x8		jFD4x8
+#define jpeg_fdct_3x6		jFD3x6
+#define jpeg_fdct_2x4		jFD2x4
+#define jpeg_fdct_1x2		jFD1x2
 #define jpeg_idct_islow		jRDislow
 #define jpeg_idct_ifast		jRDifast
 #define jpeg_idct_float		jRDfloat
+#define jpeg_idct_7x7		jRD7x7
+#define jpeg_idct_6x6		jRD6x6
+#define jpeg_idct_5x5		jRD5x5
 #define jpeg_idct_4x4		jRD4x4
+#define jpeg_idct_3x3		jRD3x3
 #define jpeg_idct_2x2		jRD2x2
 #define jpeg_idct_1x1		jRD1x1
+#define jpeg_idct_9x9		jRD9x9
+#define jpeg_idct_10x10		jRD10x10
+#define jpeg_idct_11x11		jRD11x11
+#define jpeg_idct_12x12		jRD12x12
+#define jpeg_idct_13x13		jRD13x13
+#define jpeg_idct_14x14		jRD14x14
+#define jpeg_idct_15x15		jRD15x15
+#define jpeg_idct_16x16		jRD16x16
+#define jpeg_idct_16x8		jRD16x8
+#define jpeg_idct_14x7		jRD14x7
+#define jpeg_idct_12x6		jRD12x6
+#define jpeg_idct_10x5		jRD10x5
+#define jpeg_idct_8x4		jRD8x4
+#define jpeg_idct_6x3		jRD6x3
+#define jpeg_idct_4x2		jRD4x2
+#define jpeg_idct_2x1		jRD2x1
+#define jpeg_idct_8x16		jRD8x16
+#define jpeg_idct_7x14		jRD7x14
+#define jpeg_idct_6x12		jRD6x12
+#define jpeg_idct_5x10		jRD5x10
+#define jpeg_idct_4x8		jRD4x8
+#define jpeg_idct_3x6		jRD3x6
+#define jpeg_idct_2x4		jRD2x4
+#define jpeg_idct_1x2		jRD1x2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jpeg_fdct_islow
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_ifast
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_float
+    JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_9x9
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_11x11
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_13x13
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_15x15
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
 
 EXTERN(void) jpeg_idct_islow
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
@@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast
 EXTERN(void) jpeg_idct_float
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_4x4
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_2x2
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_1x1
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_9x9
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_11x11
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_13x13
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_15x15
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x1
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_1x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 
 
 /*
diff --git a/jpeg/jddctmgr.c b/jpeg/jddctmgr.c
index bbf8d0e92..0ded9d574 100644
--- a/jpeg/jddctmgr.c
+++ b/jpeg/jddctmgr.c
@@ -2,6 +2,7 @@
  * jddctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2002-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -98,22 +99,134 @@ start_pass (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Select the proper IDCT routine for this component's scaling */
-    switch (compptr->DCT_scaled_size) {
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
-    case 1:
+    case ((1 << 8) + 1):
       method_ptr = jpeg_idct_1x1;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
-    case 2:
+    case ((2 << 8) + 2):
       method_ptr = jpeg_idct_2x2;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
-    case 4:
+    case ((3 << 8) + 3):
+      method_ptr = jpeg_idct_3x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
       method_ptr = jpeg_idct_4x4;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      method_ptr = jpeg_idct_5x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      method_ptr = jpeg_idct_6x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      method_ptr = jpeg_idct_7x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      method_ptr = jpeg_idct_9x9;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      method_ptr = jpeg_idct_10x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      method_ptr = jpeg_idct_11x11;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      method_ptr = jpeg_idct_12x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      method_ptr = jpeg_idct_13x13;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      method_ptr = jpeg_idct_14x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      method_ptr = jpeg_idct_15x15;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      method_ptr = jpeg_idct_16x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      method_ptr = jpeg_idct_16x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      method_ptr = jpeg_idct_14x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      method_ptr = jpeg_idct_12x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      method_ptr = jpeg_idct_10x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      method_ptr = jpeg_idct_8x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      method_ptr = jpeg_idct_6x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      method_ptr = jpeg_idct_4x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      method_ptr = jpeg_idct_2x1;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      method_ptr = jpeg_idct_8x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      method_ptr = jpeg_idct_7x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      method_ptr = jpeg_idct_6x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      method_ptr = jpeg_idct_5x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      method_ptr = jpeg_idct_4x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      method_ptr = jpeg_idct_3x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      method_ptr = jpeg_idct_2x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      method_ptr = jpeg_idct_1x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
 #endif
-    case DCTSIZE:
+    case ((DCTSIZE << 8) + DCTSIZE):
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
@@ -139,7 +252,8 @@ start_pass (j_decompress_ptr cinfo)
       }
       break;
     default:
-      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	       compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
       break;
     }
     idct->pub.inverse_DCT[ci] = method_ptr;
@@ -211,6 +325,7 @@ start_pass (j_decompress_ptr cinfo)
 	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
 	 *   scalefactor[0] = 1
 	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+	 * We apply a further scale factor of 1/8.
 	 */
 	FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
 	int row, col;
@@ -224,7 +339,7 @@ start_pass (j_decompress_ptr cinfo)
 	  for (col = 0; col < DCTSIZE; col++) {
 	    fmtbl[i] = (FLOAT_MULT_TYPE)
 	      ((double) qtbl->quantval[i] *
-	       aanscalefactor[row] * aanscalefactor[col]);
+	       aanscalefactor[row] * aanscalefactor[col] * 0.125);
 	    i++;
 	  }
 	}
diff --git a/jpeg/jdhuff.c b/jpeg/jdhuff.c
index b5ba39f73..06f92fe47 100644
--- a/jpeg/jdhuff.c
+++ b/jpeg/jdhuff.c
@@ -2,10 +2,12 @@
  * jdhuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains Huffman entropy decoding routines.
+ * Both sequential and progressive modes are supported in this single module.
  *
  * Much of the complexity here has to do with supporting input suspension.
  * If the data source module demands suspension, we want to be able to back
@@ -17,7 +19,173 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdphuff.c */
+
+
+/* Derived data constructed for each Huffman table */
+
+#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+
+typedef struct {
+  /* Basic tables: (element [0] of each array is unused) */
+  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
+  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
+  INT32 valoffset[17];		/* huffval[] offset for codes of length k */
+  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
+   * the smallest code of length k; so given a code of length k, the
+   * corresponding symbol is huffval[code + valoffset[k]]
+   */
+
+  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
+  JHUFF_TBL *pub;
+
+  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+   * the input data stream.  If the next Huffman code is no more
+   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
+   * the corresponding symbol directly from these tables.
+   */
+  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
+  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+} d_derived_tbl;
+
+
+/*
+ * Fetching the next N bits from the input stream is a time-critical operation
+ * for the Huffman decoders.  We implement it with a combination of inline
+ * macros and out-of-line subroutines.  Note that N (the number of bits
+ * demanded at one time) never exceeds 15 for JPEG use.
+ *
+ * We read source bytes into get_buffer and dole out bits as needed.
+ * If get_buffer already contains enough bits, they are fetched in-line
+ * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
+ * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
+ * as full as possible (not just to the number of bits needed; this
+ * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
+ * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
+ * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
+ * at least the requested number of bits --- dummy zeroes are inserted if
+ * necessary.
+ */
+
+typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  32	/* size of buffer in bits */
+
+/* If long is > 32 bits on your machine, and shifting/masking longs is
+ * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
+ * appropriately should be a win.  Unfortunately we can't define the size
+ * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
+ * because not all machines measure sizeof in 8-bit bytes.
+ */
+
+typedef struct {		/* Bitreading state saved across MCUs */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+} bitread_perm_state;
+
+typedef struct {		/* Bitreading working state within an MCU */
+  /* Current data source location */
+  /* We need a copy, rather than munging the original, in case of suspension */
+  const JOCTET * next_input_byte; /* => next byte to read from source */
+  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
+  /* Bit input buffer --- note these values are kept in register variables,
+   * not in this struct, inside the inner loops.
+   */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+  /* Pointer needed by jpeg_fill_bit_buffer. */
+  j_decompress_ptr cinfo;	/* back link to decompress master record */
+} bitread_working_state;
+
+/* Macros to declare and load/save bitread local variables. */
+#define BITREAD_STATE_VARS  \
+	register bit_buf_type get_buffer;  \
+	register int bits_left;  \
+	bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop,permstate)  \
+	br_state.cinfo = cinfop; \
+	br_state.next_input_byte = cinfop->src->next_input_byte; \
+	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+	get_buffer = permstate.get_buffer; \
+	bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop,permstate)  \
+	cinfop->src->next_input_byte = br_state.next_input_byte; \
+	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+	permstate.get_buffer = get_buffer; \
+	permstate.bits_left = bits_left
+
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
+ * before using GET_BITS, PEEK_BITS, or DROP_BITS.
+ * The variables get_buffer and bits_left are assumed to be locals,
+ * but the state struct might not be (jpeg_huff_decode needs this).
+ *	CHECK_BIT_BUFFER(state,n,action);
+ *		Ensure there are N bits in get_buffer; if suspend, take action.
+ *      val = GET_BITS(n);
+ *		Fetch next N bits.
+ *      val = PEEK_BITS(n);
+ *		Fetch next N bits without removing them from the buffer.
+ *	DROP_BITS(n);
+ *		Discard next N bits.
+ * The value N should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ */
+
+#define CHECK_BIT_BUFFER(state,nbits,action) \
+	{ if (bits_left < (nbits)) {  \
+	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
+	      { action; }  \
+	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+
+#define GET_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -= (nbits)))) & BIT_MASK(nbits))
+
+#define PEEK_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -  (nbits)))) & BIT_MASK(nbits))
+
+#define DROP_BITS(nbits) \
+	(bits_left -= (nbits))
+
+
+/*
+ * Code for extracting next Huffman-coded symbol from input bit stream.
+ * Again, this is time-critical and we make the main paths be macros.
+ *
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping.  Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long.  The few overlength codes are handled with a loop,
+ * which need not be inline code.
+ *
+ * Notes about the HUFF_DECODE macro:
+ * 1. Near the end of the data segment, we may fail to get enough bits
+ *    for a lookahead.  In that case, we do it the hard way.
+ * 2. If the lookahead table contains no entry, the next code must be
+ *    more than HUFF_LOOKAHEAD bits long.
+ * 3. jpeg_huff_decode returns -1 if forced to suspend.
+ */
+
+#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
+{ register int nb, look; \
+  if (bits_left < HUFF_LOOKAHEAD) { \
+    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+    if (bits_left < HUFF_LOOKAHEAD) { \
+      nb = 1; goto slowlabel; \
+    } \
+  } \
+  look = PEEK_BITS(HUFF_LOOKAHEAD); \
+  if ((nb = htbl->look_nbits[look]) != 0) { \
+    DROP_BITS(nb); \
+    result = htbl->look_sym[look]; \
+  } else { \
+    nb = HUFF_LOOKAHEAD+1; \
+slowlabel: \
+    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
+	{ failaction; } \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+  } \
+}
 
 
 /*
@@ -28,7 +196,8 @@
  */
 
 typedef struct {
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
+  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
 } savable_state;
 
 /* This macro is to work around compilers with missing or broken
@@ -41,7 +210,8 @@ typedef struct {
 #else
 #if MAX_COMPS_IN_SCAN == 4
 #define ASSIGN_STATE(dest,src)  \
-	((dest).last_dc_val[0] = (src).last_dc_val[0], \
+	((dest).EOBRUN = (src).EOBRUN, \
+	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
 	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
 	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
 	 (dest).last_dc_val[3] = (src).last_dc_val[3])
@@ -59,8 +229,18 @@ typedef struct {
   savable_state saved;		/* Other state at start of MCU */
 
   /* These fields are NOT loaded into local working state. */
+  boolean insufficient_data;	/* set TRUE after emitting warning */
   unsigned int restarts_to_go;	/* MCUs left in this restart interval */
 
+  /* Following two fields used only in progressive mode */
+
+  /* Pointers to derived tables (these workspaces have image lifespan) */
+  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+
+  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+
+  /* Following fields used only in sequential mode */
+
   /* Pointers to derived tables (these workspaces have image lifespan) */
   d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
   d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
@@ -71,81 +251,75 @@ typedef struct {
   d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   /* Whether we care about the DC and AC coefficient values for each block */
-  boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
-  boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
+  int coef_limit[D_MAX_BLOCKS_IN_MCU];
 } huff_entropy_decoder;
 
 typedef huff_entropy_decoder * huff_entropy_ptr;
 
 
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
-{
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, blkn, dctbl, actbl;
-  jpeg_component_info * compptr;
-
-  /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
-   * This ought to be an error condition, but we make it a warning because
-   * there are some baseline files out there with all zeroes in these bytes.
-   */
-  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
-      cinfo->Ah != 0 || cinfo->Al != 0)
-    WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    /* Compute derived values for Huffman tables */
-    /* We may do this more than once for a table, but it's not expensive */
-    jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
-			    & entropy->dc_derived_tbls[dctbl]);
-    jpeg_make_d_derived_tbl(cinfo, FALSE, actbl,
-			    & entropy->ac_derived_tbls[actbl]);
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Precalculate decoding info for each block in an MCU of this scan */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-    /* Precalculate which table to use for each block */
-    entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
-    entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
-    /* Decide whether we really care about the coefficient values */
-    if (compptr->component_needed) {
-      entropy->dc_needed[blkn] = TRUE;
-      /* we don't need the ACs if producing a 1/8th-size image */
-      entropy->ac_needed[blkn] = (compptr->DCT_scaled_size > 1);
-    } else {
-      entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE;
-    }
-  }
-
-  /* Initialize bitread state variables */
-  entropy->bitstate.bits_left = 0;
-  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
-  entropy->pub.insufficient_data = FALSE;
-
-  /* Initialize restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-}
+static const int jpeg_zigzag_order[8][8] = {	/* [i][j] -> zigzag scan position */
+  {  0,  1,  5,  6, 14, 15, 27, 28 },
+  {  2,  4,  7, 13, 16, 26, 29, 42 },
+  {  3,  8, 12, 17, 25, 30, 41, 43 },
+  {  9, 11, 18, 24, 31, 40, 44, 53 },
+  { 10, 19, 23, 32, 39, 45, 52, 54 },
+  { 20, 22, 33, 38, 46, 51, 55, 60 },
+  { 21, 34, 37, 47, 50, 56, 59, 61 },
+  { 35, 36, 48, 49, 57, 58, 62, 63 }
+};
+
+static const int jpeg_zigzag_order7[7][7] = {	/* same mapping, 7x7 block */
+  {  0,  1,  5,  6, 14, 15, 27 },
+  {  2,  4,  7, 13, 16, 26, 28 },
+  {  3,  8, 12, 17, 25, 29, 38 },
+  {  9, 11, 18, 24, 30, 37, 39 },
+  { 10, 19, 23, 31, 36, 40, 45 },
+  { 20, 22, 32, 35, 41, 44, 46 },
+  { 21, 33, 34, 42, 43, 47, 48 }
+};
+
+static const int jpeg_zigzag_order6[6][6] = {	/* same mapping, 6x6 block */
+  {  0,  1,  5,  6, 14, 15 },
+  {  2,  4,  7, 13, 16, 25 },
+  {  3,  8, 12, 17, 24, 26 },
+  {  9, 11, 18, 23, 27, 32 },
+  { 10, 19, 22, 28, 31, 33 },
+  { 20, 21, 29, 30, 34, 35 }
+};
+
+static const int jpeg_zigzag_order5[5][5] = {	/* same mapping, 5x5 block */
+  {  0,  1,  5,  6, 14 },
+  {  2,  4,  7, 13, 15 },
+  {  3,  8, 12, 16, 21 },
+  {  9, 11, 17, 20, 22 },
+  { 10, 18, 19, 23, 24 }
+};
+
+static const int jpeg_zigzag_order4[4][4] = {	/* same mapping, 4x4 block */
+  { 0,  1,  5,  6 },
+  { 2,  4,  7, 12 },
+  { 3,  8, 11, 13 },
+  { 9, 10, 14, 15 }
+};
+
+static const int jpeg_zigzag_order3[3][3] = {	/* same mapping, 3x3 block */
+  { 0, 1, 5 },
+  { 2, 4, 6 },
+  { 3, 7, 8 }
+};
+
+static const int jpeg_zigzag_order2[2][2] = {	/* same mapping, 2x2 block */
+  { 0, 1 },
+  { 2, 3 }
+};
 
 
 /*
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
- *
- * Note this is also used by jdphuff.c.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 			 d_derived_tbl ** pdtbl)
 {
@@ -267,8 +441,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 
 
 /*
- * Out-of-line code for bit fetching (shared with jdphuff.c).
- * See jdhuff.h for info about usage.
+ * Out-of-line code for bit fetching.
  * Note: current values of get_buffer and bits_left are passed as parameters,
  * but are returned in the corresponding fields of the state struct.
  *
@@ -288,7 +461,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 #endif
 
 
-GLOBAL(boolean)
+LOCAL(boolean)
 jpeg_fill_bit_buffer (bitread_working_state * state,
 		      register bit_buf_type get_buffer, register int bits_left,
 		      int nbits)
@@ -369,9 +542,9 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
        * We use a nonvolatile flag to ensure that only one warning message
        * appears per data segment.
        */
-      if (! cinfo->entropy->insufficient_data) {
+      if (! ((huff_entropy_ptr) cinfo->entropy)->insufficient_data) {
 	WARNMS(cinfo, JWRN_HIT_MARKER);
-	cinfo->entropy->insufficient_data = TRUE;
+	((huff_entropy_ptr) cinfo->entropy)->insufficient_data = TRUE;
       }
       /* Fill the buffer with zero bits */
       get_buffer <<= MIN_GET_BITS - bits_left;
@@ -389,12 +562,33 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
 }
 
 
+/*
+ * Figure F.12: extend sign bit.  BIT_MASK(n) masks the n rightmost bits.
+ * On some machines, a shift and sub will be faster than a table lookup.
+ */
+
+#ifdef AVOID_TABLES
+
+#define BIT_MASK(nbits)   ((1<<(nbits))-1)
+#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) - ((1<<(s))-1) : (x))
+
+#else
+
+#define BIT_MASK(nbits)   bmask[nbits]
+#define HUFF_EXTEND(x,s)  ((x) <= bmask[(s) - 1] ? (x) - bmask[s] : (x))
+/* HUFF_EXTEND reads bmask[s-1] and bmask[s], so it needs 1 <= s <= 15. */
+static const int bmask[16] =	/* bmask[n] is mask for n rightmost bits */
+  { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF,
+    0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF };
+
+#endif /* AVOID_TABLES */
+
+
 /*
  * Out-of-line code for Huffman code decoding.
- * See jdhuff.h for info about usage.
  */
 
-GLOBAL(int)
+LOCAL(int)
 jpeg_huff_decode (bitread_working_state * state,
 		  register bit_buf_type get_buffer, register int bits_left,
 		  d_derived_tbl * htbl, int min_bits)
@@ -433,32 +627,6 @@ jpeg_huff_decode (bitread_working_state * state,
 }
 
 
-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
 /*
  * Check for a restart marker & resynchronize decoder.
  * Returns FALSE if must suspend.
@@ -482,6 +650,8 @@ process_restart (j_decompress_ptr cinfo)
   /* Re-initialize DC predictions to 0 */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++)
     entropy->saved.last_dc_val[ci] = 0;
+  /* Re-init EOB run count, too */
+  entropy->saved.EOBRUN = 0;
 
   /* Reset restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -492,34 +662,47 @@ process_restart (j_decompress_ptr cinfo)
    * leaving the flag set.
    */
   if (cinfo->unread_marker == 0)
-    entropy->pub.insufficient_data = FALSE;
+    entropy->insufficient_data = FALSE;
 
   return TRUE;
 }
 
 
 /*
- * Decode and return one MCU's worth of Huffman-compressed coefficients.
+ * Huffman MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * Huffman-compressed coefficients. 
  * The coefficients are reordered from zigzag order into natural array order,
  * but are not dequantized.
  *
  * The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i].  WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
  * (Wholesale zeroing is usually a little faster than retail...)
  *
- * Returns FALSE if data source requested suspension.  In that case no
+ * We return FALSE if data source requested suspension.  In that case no
  * changes have been made to permanent state.  (Exception: some output
  * coefficients may already have been assigned.  This is harmless for
- * this module, since we'll just re-assign them on the next call.)
+ * spectral selection, since we'll just re-assign them on the next call.
+ * Successive approximation AC refinement has to be more careful, however.)
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int blkn;
+  int Al = cinfo->Al;
+  register int s, r;
+  int blkn, ci;
+  JBLOCKROW block;
   BITREAD_STATE_VARS;
   savable_state state;
+  d_derived_tbl * tbl;
+  jpeg_component_info * compptr;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -531,7 +714,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (! entropy->insufficient_data) {
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
@@ -540,79 +723,571 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Outer loop handles each block in the MCU */
 
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      JBLOCKROW block = MCU_data[blkn];
-      d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-      d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
-      register int s, k, r;
+      block = MCU_data[blkn];
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
 
       /* Decode a single block's worth of coefficients */
 
       /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
       if (s) {
 	CHECK_BIT_BUFFER(br_state, s, return FALSE);
 	r = GET_BITS(s);
 	s = HUFF_EXTEND(r, s);
       }
 
-      if (entropy->dc_needed[blkn]) {
+      /* Convert DC difference to actual value, update last_dc_val */
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
+      (*block)[0] = (JCOEF) (s << Al);
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(entropy->saved, state);
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int s, k, r;
+  unsigned int EOBRUN;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+  BITREAD_STATE_VARS;
+  d_derived_tbl * tbl;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;		/* suspended while reading the marker */
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    Se = cinfo->Se;
+    Al = cinfo->Al;
+    natural_order = cinfo->natural_order;
+
+    /* Load up working state.
+     * We can avoid loading/saving bitread state if in an EOB run.
+     */
+    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
+
+    /* There is always only one block per MCU */
+
+    if (EOBRUN > 0)		/* if it's a band of zeroes... */
+      EOBRUN--;			/* ...process it now (we do nothing) */
+    else {
+      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+      block = MCU_data[0];
+      tbl = entropy->ac_derived_tbl;
+
+      for (k = cinfo->Ss; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
+	r = s >> 4;		/* high nibble: zero run / EOB exponent */
+	s &= 15;		/* low nibble: bit size of the coefficient */
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	  /* Scale by 2^Al (multiply: s may be negative) and output dezigzagged */
+	  (*block)[natural_order[k]] = (JCOEF) (s * (1 << Al));
+	} else {
+	  if (r == 15) {	/* ZRL */
+	    k += 15;		/* skip 15 zeroes in band */
+	  } else {		/* EOBr, run length is 2^r + appended bits */
+	    EOBRUN = 1 << r;
+	    if (r) {		/* EOBr, r > 0 */
+	      CHECK_BIT_BUFFER(br_state, r, return FALSE);
+	      r = GET_BITS(r);
+	      EOBRUN += r;
+	    }
+	    EOBRUN--;		/* this band is processed at this moment */
+	    break;		/* force end-of-band */
+	  }
+	}
+      }
+
+      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    }
+
+    /* Completed MCU, so update state */
+    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan: one bit is
+ * read per block and ORed into coefficient 0 at position Al.  We assume such
+ * scans can be multi-component, although the spec is not clear on the point.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
+  int blkn;
+  JBLOCKROW block;
+  BITREAD_STATE_VARS;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;		/* suspended while reading the marker */
+  }
+
+  /* Not worth the cycles to check insufficient_data here,
+   * since we will not change the data anyway if we read zeroes.
+   */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    CHECK_BIT_BUFFER(br_state, 1, return FALSE);	/* bitstate not yet saved */
+    if (GET_BITS(1))
+      (*block)[0] |= p1;
+    /* Note: since we use |=, repeating the assignment later is safe */
+  }
+
+  /* Completed MCU, so update state */
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;			/* every block of this MCU was processed */
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int s, k, r;
+  unsigned int EOBRUN;
+  int Se, p1, m1;
+  const int * natural_order;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  BITREAD_STATE_VARS;
+  d_derived_tbl * tbl;
+  int num_newnz;		/* count of entries used in newnz_pos[] */
+  int newnz_pos[DCTSIZE2];	/* positions made nonzero during this call */
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;		/* suspended while reading the marker */
+  }
+
+  /* If we've run out of data, don't modify the MCU.
+   */
+  if (! entropy->insufficient_data) {
+
+    Se = cinfo->Se;
+    p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
+    m1 = -p1;			/* -1 in that position; no negative shift */
+    natural_order = cinfo->natural_order;
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+    /* There is always only one block per MCU */
+    block = MCU_data[0];
+    tbl = entropy->ac_derived_tbl;
+
+    /* If we are forced to suspend, we must undo the assignments to any newly
+     * nonzero coefficients in the block, because otherwise we'd get confused
+     * next time about which coefficients were already nonzero.
+     * But we need not undo addition of bits to already-nonzero coefficients;
+     * instead, we can test the current bit to see if we already did it.
+     */
+    num_newnz = 0;
+
+    /* initialize coefficient loop counter to start of band */
+    k = cinfo->Ss;
+
+    if (EOBRUN == 0) {
+      for (; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
+	r = s >> 4;		/* high nibble: zero run / EOB exponent */
+	s &= 15;		/* low nibble: size of the new coefficient */
+	if (s) {
+	  if (s != 1)		/* size of new coef should always be 1 */
+	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	  if (GET_BITS(1))
+	    s = p1;		/* newly nonzero coef is positive */
+	  else
+	    s = m1;		/* newly nonzero coef is negative */
+	} else {
+	  if (r != 15) {
+	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
+	    if (r) {
+	      CHECK_BIT_BUFFER(br_state, r, goto undoit);
+	      r = GET_BITS(r);
+	      EOBRUN += r;
+	    }
+	    break;		/* rest of block is handled by EOB logic */
+	  }
+	  /* note s = 0 for processing ZRL */
+	}
+	/* Advance over already-nonzero coefs and r still-zero coefs,
+	 * appending correction bits to the nonzeroes.  A correction bit is 1
+	 * if the absolute value of the coefficient must be increased.
+	 */
+	do {
+	  thiscoef = *block + natural_order[k];
+	  if (*thiscoef != 0) {
+	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	    if (GET_BITS(1)) {
+	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
+		if (*thiscoef >= 0)
+		  *thiscoef += p1;
+		else
+		  *thiscoef += m1;
+	      }
+	    }
+	  } else {
+	    if (--r < 0)
+	      break;		/* reached target zero coefficient */
+	  }
+	  k++;
+	} while (k <= Se);
+	if (s) {
+	  int pos = natural_order[k];
+	  /* Output newly nonzero coefficient */
+	  (*block)[pos] = (JCOEF) s;
+	  /* Remember its position in case we have to suspend */
+	  newnz_pos[num_newnz++] = pos;
+	}
+      }
+    }
+
+    if (EOBRUN > 0) {
+      /* Scan any remaining coefficient positions after the end-of-band
+       * (the last newly nonzero coefficient, if any).  Append a correction
+       * bit to each already-nonzero coefficient.  A correction bit is 1
+       * if the absolute value of the coefficient must be increased.
+       */
+      for (; k <= Se; k++) {
+	thiscoef = *block + natural_order[k];
+	if (*thiscoef != 0) {
+	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	  if (GET_BITS(1)) {
+	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
+	      if (*thiscoef >= 0)
+		*thiscoef += p1;
+	      else
+		*thiscoef += m1;
+	    }
+	  }
+	}
+      }
+      /* Count one block completed in EOB run */
+      EOBRUN--;
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+
+undoit:			/* reached only after block has been assigned above */
+  /* Re-zero any output coefficients that we made newly nonzero */
+  while (num_newnz > 0)
+    (*block)[newnz_pos[--num_newnz]] = 0;
+
+  return FALSE;			/* suspended; bitstate and EOBRUN not saved */
+}
+
+
+/*
+ * Decode one MCU's worth of Huffman-compressed coefficients,
+ * partial blocks.
+ */
+
+METHODDEF(boolean)
+decode_mcu_sub (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  const int * natural_order;
+  int Se, blkn;
+  BITREAD_STATE_VARS;
+  savable_state state;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    natural_order = cinfo->natural_order;
+    Se = cinfo->lim_Se;
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(state, entropy->saved);
+
+    /* Outer loop handles each block in the MCU */
+
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      JBLOCKROW block = MCU_data[blkn];
+      d_derived_tbl * htbl;
+      register int s, k, r;
+      int coef_limit, ci;
+
+      /* Decode a single block's worth of coefficients */
+
+      /* Section F.2.2.1: decode the DC coefficient difference */
+      htbl = entropy->dc_cur_tbls[blkn];
+      HUFF_DECODE(s, br_state, htbl, return FALSE, label1);
+
+      htbl = entropy->ac_cur_tbls[blkn];
+      k = 1;
+      coef_limit = entropy->coef_limit[blkn];
+      if (coef_limit) {
 	/* Convert DC difference to actual value, update last_dc_val */
-	int ci = cinfo->MCU_membership[blkn];
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	}
+	ci = cinfo->MCU_membership[blkn];
 	s += state.last_dc_val[ci];
 	state.last_dc_val[ci] = s;
-	/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+	/* Output the DC coefficient */
 	(*block)[0] = (JCOEF) s;
-      }
-
-      if (entropy->ac_needed[blkn]) {
 
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* Since zeroes are skipped, output area must be cleared beforehand */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
+	for (; k < coef_limit; k++) {
+	  HUFF_DECODE(s, br_state, htbl, return FALSE, label2);
+
 	  r = s >> 4;
 	  s &= 15;
-      
+
 	  if (s) {
 	    k += r;
 	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
 	    r = GET_BITS(s);
 	    s = HUFF_EXTEND(r, s);
 	    /* Output coefficient in natural (dezigzagged) order.
-	     * Note: the extra entries in jpeg_natural_order[] will save us
-	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
+	     * Note: the extra entries in natural_order[] will save us
+	     * if k > Se, which could happen if the data is corrupted.
 	     */
-	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+	    (*block)[natural_order[k]] = (JCOEF) s;
 	  } else {
 	    if (r != 15)
-	      break;
+	      goto EndOfBlock;
 	    k += 15;
 	  }
 	}
-
       } else {
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	}
+      }
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, htbl, return FALSE, label3);
+
+	r = s >> 4;
+	s &= 15;
+
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	} else {
+	  if (r != 15)
+	    break;
+	  k += 15;
+	}
+      }
+
+      EndOfBlock: ;
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(entropy->saved, state);
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of Huffman-compressed coefficients,
+ * full-size blocks.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int blkn;
+  BITREAD_STATE_VARS;
+  savable_state state;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(state, entropy->saved);
+
+    /* Outer loop handles each block in the MCU */
+
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      JBLOCKROW block = MCU_data[blkn];
+      d_derived_tbl * htbl;
+      register int s, k, r;
+      int coef_limit, ci;
+
+      /* Decode a single block's worth of coefficients */
+
+      /* Section F.2.2.1: decode the DC coefficient difference */
+      htbl = entropy->dc_cur_tbls[blkn];
+      HUFF_DECODE(s, br_state, htbl, return FALSE, label1);
+
+      htbl = entropy->ac_cur_tbls[blkn];
+      k = 1;
+      coef_limit = entropy->coef_limit[blkn];
+      if (coef_limit) {
+	/* Convert DC difference to actual value, update last_dc_val */
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	}
+	ci = cinfo->MCU_membership[blkn];
+	s += state.last_dc_val[ci];
+	state.last_dc_val[ci] = s;
+	/* Output the DC coefficient */
+	(*block)[0] = (JCOEF) s;
 
 	/* Section F.2.2.2: decode the AC coefficients */
-	/* In this path we just discard the values */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
+	/* Since zeroes are skipped, output area must be cleared beforehand */
+	for (; k < coef_limit; k++) {
+	  HUFF_DECODE(s, br_state, htbl, return FALSE, label2);
+
 	  r = s >> 4;
 	  s &= 15;
-      
+
 	  if (s) {
 	    k += r;
 	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
+	    r = GET_BITS(s);
+	    s = HUFF_EXTEND(r, s);
+	    /* Output coefficient in natural (dezigzagged) order.
+	     * Note: the extra entries in jpeg_natural_order[] will save us
+	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
+	     */
+	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
 	  } else {
 	    if (r != 15)
-	      break;
+	      goto EndOfBlock;
 	    k += 15;
 	  }
 	}
+      } else {
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	}
+      }
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (; k < DCTSIZE2; k++) {
+	HUFF_DECODE(s, br_state, htbl, return FALSE, label3);
 
+	r = s >> 4;
+	s &= 15;
+
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	} else {
+	  if (r != 15)
+	    break;
+	  k += 15;
+	}
       }
+
+      EndOfBlock: ;
     }
 
     /* Completed MCU, so update state */
@@ -627,6 +1302,205 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 }
 
 
+/*
+ * Initialize for a Huffman-compressed scan: check Ss/Se/Ah/Al, select the
+ * decode_mcu routine, build derived tables, and reset the decoder state. */
+
+METHODDEF(void)
+start_pass_huff_decoder (j_decompress_ptr cinfo)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int ci, blkn, tbl, i; /* ci and i are reused as scratch in the coef_limit switch */
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) { /* scan starts at coefficient 0: Se must be 0 too */
+      if (cinfo->Se != 0)
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > cinfo->lim_Se)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
+       * but the spec doesn't say so, and we try to be liberal about what we
+       * accept.  Note: large Al values could result in out-of-range DC
+       * coefficients during early scans, leading to bizarre displays due to
+       * overflows in the IDCT math.  But we won't crash.
+       */
+      bad: /* reached by goto from any failed check above, or by Al > 13 */
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; /* negative entries expect Ah == 0 */
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al; /* a later scan's Ah is checked against this value */
+      }
+    }
+
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) { /* Ah == 0 picks the "first" routines, otherwise the "refine" ones */
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Make sure requested tables are present, and compute derived tables.
+       * We may build same derived table more than once, but it's not expensive.
+       */
+      if (cinfo->Ss == 0) {
+	if (cinfo->Ah == 0) {	/* DC refinement needs no table */
+	  tbl = compptr->dc_tbl_no;
+	  jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
+				  & entropy->derived_tbls[tbl]);
+	}
+      } else {
+	tbl = compptr->ac_tbl_no;
+	jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->derived_tbls[tbl]);
+	/* remember the single active table */
+	entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+
+    /* Initialize private state variables */
+    entropy->saved.EOBRUN = 0;
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning because
+     * there are some baseline files out there with all zeroes in these bytes.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	((cinfo->is_baseline || cinfo->Se < DCTSIZE2) &&
+	cinfo->Se != cinfo->lim_Se))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+
+    /* Select MCU decoding routine */
+    /* We retain the hard-coded case for full-size blocks.
+     * This is not necessary, but it appears that this version is slightly
+     * more performant in the given implementation.
+     * With an improved implementation we would prefer a single optimized
+     * function.
+     */
+    if (cinfo->lim_Se != DCTSIZE2-1)
+      entropy->pub.decode_mcu = decode_mcu_sub;
+    else
+      entropy->pub.decode_mcu = decode_mcu;
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Compute derived values for Huffman tables */
+      /* We may do this more than once for a table, but it's not expensive */
+      tbl = compptr->dc_tbl_no;
+      jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
+			      & entropy->dc_derived_tbls[tbl]);
+      if (cinfo->lim_Se) {	/* AC needs no table when not present */
+	tbl = compptr->ac_tbl_no;
+	jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->ac_derived_tbls[tbl]);
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+
+    /* Precalculate decoding info for each block in an MCU of this scan */
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn]; /* index into cur_comp_info for this block */
+      compptr = cinfo->cur_comp_info[ci];
+      /* Precalculate which table to use for each block */
+      entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
+      entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
+      /* Decide whether we really care about the coefficient values */
+      if (compptr->component_needed) {
+	ci = compptr->DCT_v_scaled_size; /* from here ci and i hold scaled DCT sizes, not indexes */
+	i = compptr->DCT_h_scaled_size;
+	switch (cinfo->lim_Se) {
+	case (1*1-1):
+	  entropy->coef_limit[blkn] = 1; /* limit of 1: only the DC coefficient is kept */
+	  break;
+	case (2*2-1):
+	  if (ci <= 0 || ci > 2) ci = 2; /* out-of-range size: fall back to the full 2x2 block */
+	  if (i <= 0 || i > 2) i = 2;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order2[ci - 1][i - 1]; /* exclusive limit; table presumably maps (row,col) to zigzag index -- TODO confirm */
+	  break;
+	case (3*3-1):
+	  if (ci <= 0 || ci > 3) ci = 3;
+	  if (i <= 0 || i > 3) i = 3;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order3[ci - 1][i - 1];
+	  break;
+	case (4*4-1):
+	  if (ci <= 0 || ci > 4) ci = 4;
+	  if (i <= 0 || i > 4) i = 4;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order4[ci - 1][i - 1];
+	  break;
+	case (5*5-1):
+	  if (ci <= 0 || ci > 5) ci = 5;
+	  if (i <= 0 || i > 5) i = 5;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order5[ci - 1][i - 1];
+	  break;
+	case (6*6-1):
+	  if (ci <= 0 || ci > 6) ci = 6;
+	  if (i <= 0 || i > 6) i = 6;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order6[ci - 1][i - 1];
+	  break;
+	case (7*7-1):
+	  if (ci <= 0 || ci > 7) ci = 7;
+	  if (i <= 0 || i > 7) i = 7;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order7[ci - 1][i - 1];
+	  break;
+	default:
+	  if (ci <= 0 || ci > 8) ci = 8;
+	  if (i <= 0 || i > 8) i = 8;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order[ci - 1][i - 1];
+	  break;
+	}
+      } else {
+	entropy->coef_limit[blkn] = 0; /* 0: coefficient values for this block are not stored */
+      }
+    }
+  }
+
+  /* Initialize bitread state variables */
+  entropy->bitstate.bits_left = 0;
+  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+  entropy->insufficient_data = FALSE;
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
 /*
  * Module initialization routine for Huffman entropy decoding.
  */
@@ -642,10 +1516,26 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
 				SIZEOF(huff_entropy_decoder));
   cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
   entropy->pub.start_pass = start_pass_huff_decoder;
-  entropy->pub.decode_mcu = decode_mcu;
 
-  /* Mark tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;
+
+    /* Mark derived tables unallocated */
+    for (i = 0; i < NUM_HUFF_TBLS; i++) {
+      entropy->derived_tbls[i] = NULL;
+    }
+  } else {
+    /* Mark tables unallocated */
+    for (i = 0; i < NUM_HUFF_TBLS; i++) {
+      entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+    }
   }
 }
diff --git a/jpeg/jdhuff.h b/jpeg/jdhuff.h
deleted file mode 100644
index ae19b6caf..000000000
--- a/jpeg/jdhuff.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * jdhuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy decoding routines
- * that are shared between the sequential decoder (jdhuff.c) and the
- * progressive decoder (jdphuff.c).  No other modules need to see these.
- */
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_d_derived_tbl	jMkDDerived
-#define jpeg_fill_bit_buffer	jFilBitBuf
-#define jpeg_huff_decode	jHufDecode
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
-/* Derived data constructed for each Huffman table */
-
-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
-
-typedef struct {
-  /* Basic tables: (element [0] of each array is unused) */
-  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
-  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
-  INT32 valoffset[17];		/* huffval[] offset for codes of length k */
-  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
-   * the smallest code of length k; so given a code of length k, the
-   * corresponding symbol is huffval[code + valoffset[k]]
-   */
-
-  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
-  JHUFF_TBL *pub;
-
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
-   * the input data stream.  If the next Huffman code is no more
-   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
-   */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
-} d_derived_tbl;
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
-	JPP((j_decompress_ptr cinfo, boolean isDC, int tblno,
-	     d_derived_tbl ** pdtbl));
-
-
-/*
- * Fetching the next N bits from the input stream is a time-critical operation
- * for the Huffman decoders.  We implement it with a combination of inline
- * macros and out-of-line subroutines.  Note that N (the number of bits
- * demanded at one time) never exceeds 15 for JPEG use.
- *
- * We read source bytes into get_buffer and dole out bits as needed.
- * If get_buffer already contains enough bits, they are fetched in-line
- * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
- * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
- * as full as possible (not just to the number of bits needed; this
- * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
- * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
- * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
- * at least the requested number of bits --- dummy zeroes are inserted if
- * necessary.
- */
-
-typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32	/* size of buffer in bits */
-
-/* If long is > 32 bits on your machine, and shifting/masking longs is
- * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
- * appropriately should be a win.  Unfortunately we can't define the size
- * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
- * because not all machines measure sizeof in 8-bit bytes.
- */
-
-typedef struct {		/* Bitreading state saved across MCUs */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-} bitread_perm_state;
-
-typedef struct {		/* Bitreading working state within an MCU */
-  /* Current data source location */
-  /* We need a copy, rather than munging the original, in case of suspension */
-  const JOCTET * next_input_byte; /* => next byte to read from source */
-  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
-  /* Bit input buffer --- note these values are kept in register variables,
-   * not in this struct, inside the inner loops.
-   */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-  /* Pointer needed by jpeg_fill_bit_buffer. */
-  j_decompress_ptr cinfo;	/* back link to decompress master record */
-} bitread_working_state;
-
-/* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS  \
-	register bit_buf_type get_buffer;  \
-	register int bits_left;  \
-	bitread_working_state br_state
-
-#define BITREAD_LOAD_STATE(cinfop,permstate)  \
-	br_state.cinfo = cinfop; \
-	br_state.next_input_byte = cinfop->src->next_input_byte; \
-	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-	get_buffer = permstate.get_buffer; \
-	bits_left = permstate.bits_left;
-
-#define BITREAD_SAVE_STATE(cinfop,permstate)  \
-	cinfop->src->next_input_byte = br_state.next_input_byte; \
-	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-	permstate.get_buffer = get_buffer; \
-	permstate.bits_left = bits_left
-
-/*
- * These macros provide the in-line portion of bit fetching.
- * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
- * before using GET_BITS, PEEK_BITS, or DROP_BITS.
- * The variables get_buffer and bits_left are assumed to be locals,
- * but the state struct might not be (jpeg_huff_decode needs this).
- *	CHECK_BIT_BUFFER(state,n,action);
- *		Ensure there are N bits in get_buffer; if suspend, take action.
- *      val = GET_BITS(n);
- *		Fetch next N bits.
- *      val = PEEK_BITS(n);
- *		Fetch next N bits without removing them from the buffer.
- *	DROP_BITS(n);
- *		Discard next N bits.
- * The value N should be a simple variable, not an expression, because it
- * is evaluated multiple times.
- */
-
-#define CHECK_BIT_BUFFER(state,nbits,action) \
-	{ if (bits_left < (nbits)) {  \
-	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-	      { action; }  \
-	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
-
-#define GET_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
-
-#define PEEK_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
-
-#define DROP_BITS(nbits) \
-	(bits_left -= (nbits))
-
-/* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, int nbits));
-
-
-/*
- * Code for extracting next Huffman-coded symbol from input bit stream.
- * Again, this is time-critical and we make the main paths be macros.
- *
- * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
- * without looping.  Usually, more than 95% of the Huffman codes will be 8
- * or fewer bits long.  The few overlength codes are handled with a loop,
- * which need not be inline code.
- *
- * Notes about the HUFF_DECODE macro:
- * 1. Near the end of the data segment, we may fail to get enough bits
- *    for a lookahead.  In that case, we do it the hard way.
- * 2. If the lookahead table contains no entry, the next code must be
- *    more than HUFF_LOOKAHEAD bits long.
- * 3. jpeg_huff_decode returns -1 if forced to suspend.
- */
-
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
-  if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-    if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
-    } \
-  } \
-  look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
-    DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
-  } else { \
-    nb = HUFF_LOOKAHEAD+1; \
-slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-	{ failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-  } \
-}
-
-/* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, d_derived_tbl * htbl, int min_bits));
diff --git a/jpeg/jdinput.c b/jpeg/jdinput.c
index 0c2ac8f12..2c5c717b9 100644
--- a/jpeg/jdinput.c
+++ b/jpeg/jdinput.c
@@ -2,13 +2,14 @@
  * jdinput.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains input control logic for the JPEG decompressor.
  * These routines are concerned with controlling the decompressor's input
  * processing (marker reading and coefficient decoding).  The actual input
- * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ * reading is done in jdmarker.c, jdhuff.c, and jdarith.c.
  */
 
 #define JPEG_INTERNALS
@@ -21,7 +22,7 @@
 typedef struct {
   struct jpeg_input_controller pub; /* public fields */
 
-  boolean inheaders;		/* TRUE until first SOS is reached */
+  int inheaders;		/* Nonzero until first SOS is reached */
 } my_input_controller;
 
 typedef my_input_controller * my_inputctl_ptr;
@@ -35,6 +36,174 @@ METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
  * Routines to calculate various quantities related to the size of the image.
  */
 
+
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+/* Do computations that are needed before master selection phase.
+ * This function is used for transcoding and full decompression.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  jpeg_component_info *compptr;
+
+  /* Pick scaling N/block_size: the smallest N in 1..15 passing the test, else 16. */
+  if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom) {
+    /* Provide 1/block_size scaling (jdiv_round_up presumably rounds the quotient up) */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 2) {
+    /* Provide 2/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 3) {
+    /* Provide 3/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 3L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 3L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 4) {
+    /* Provide 4/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 5) {
+    /* Provide 5/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 5L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 5L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 6) {
+    /* Provide 6/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 6L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 6L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 7) {
+    /* Provide 7/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 7L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 7L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 8) {
+    /* Provide 8/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 9) {
+    /* Provide 9/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 9L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 9L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 10) {
+    /* Provide 10/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 10L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 10L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 11) {
+    /* Provide 11/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 11L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 11L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 12) {
+    /* Provide 12/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 12L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 12L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 13) {
+    /* Provide 13/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 13L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 13L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 14) {
+    /* Provide 14/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 14L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 14L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 15) {
+    /* Provide 15/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 15L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 15L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide 16/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 16L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 16L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+  /* Give every component the scaled DCT size chosen above */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size;
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+  /* initial_setup has already initialized DCT_h_scaled_size and DCT_v_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+
+
 LOCAL(void)
 initial_setup (j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
@@ -70,23 +239,121 @@ initial_setup (j_decompress_ptr cinfo)
 				   compptr->v_samp_factor);
   }
 
-  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
-   * In the full decompressor, this will be overridden by jdmaster.c;
-   * but in the transcoder, jdmaster.c is not used, so we must do it here.
+  /* Derive block_size, natural_order, and lim_Se */
+  if (cinfo->is_baseline || (cinfo->progressive_mode &&
+      cinfo->comps_in_scan)) { /* no pseudo SOS marker */
+    cinfo->block_size = DCTSIZE;
+    cinfo->natural_order = jpeg_natural_order;
+    cinfo->lim_Se = DCTSIZE2-1;
+  } else
+    switch (cinfo->Se) {
+    case (1*1-1):
+      cinfo->block_size = 1;
+      cinfo->natural_order = jpeg_natural_order; /* not needed */
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (2*2-1):
+      cinfo->block_size = 2;
+      cinfo->natural_order = jpeg_natural_order2;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (3*3-1):
+      cinfo->block_size = 3;
+      cinfo->natural_order = jpeg_natural_order3;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (4*4-1):
+      cinfo->block_size = 4;
+      cinfo->natural_order = jpeg_natural_order4;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (5*5-1):
+      cinfo->block_size = 5;
+      cinfo->natural_order = jpeg_natural_order5;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (6*6-1):
+      cinfo->block_size = 6;
+      cinfo->natural_order = jpeg_natural_order6;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (7*7-1):
+      cinfo->block_size = 7;
+      cinfo->natural_order = jpeg_natural_order7;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (8*8-1):
+      cinfo->block_size = 8;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (9*9-1):
+      cinfo->block_size = 9;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (10*10-1):
+      cinfo->block_size = 10;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (11*11-1):
+      cinfo->block_size = 11;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (12*12-1):
+      cinfo->block_size = 12;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (13*13-1):
+      cinfo->block_size = 13;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (14*14-1):
+      cinfo->block_size = 14;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (15*15-1):
+      cinfo->block_size = 15;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (16*16-1):
+      cinfo->block_size = 16;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    default:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+      break;
+    }
+
+  /* We initialize DCT_scaled_size and min_DCT_scaled_size to block_size.
+   * In the full decompressor,
+   * this will be overridden by jpeg_calc_output_dimensions in jdmaster.c;
+   * but in the transcoder,
+   * jpeg_calc_output_dimensions is not used, so we must do it here.
    */
-  cinfo->min_DCT_scaled_size = DCTSIZE;
+  cinfo->min_DCT_h_scaled_size = cinfo->block_size;
+  cinfo->min_DCT_v_scaled_size = cinfo->block_size;
 
   /* Compute dimensions of components */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    compptr->DCT_scaled_size = DCTSIZE;
+    compptr->DCT_h_scaled_size = cinfo->block_size;
+    compptr->DCT_v_scaled_size = cinfo->block_size;
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* downsampled_width and downsampled_height will also be overridden by
      * jdmaster.c if we are doing full decompression.  The transcoder library
      * doesn't use these values, but the calling application might.
@@ -107,7 +374,7 @@ initial_setup (j_decompress_ptr cinfo)
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long) cinfo->image_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+	          (long) (cinfo->max_v_samp_factor * cinfo->block_size));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -138,7 +405,7 @@ per_scan_setup (j_decompress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = compptr->DCT_scaled_size;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -161,10 +428,10 @@ per_scan_setup (j_decompress_ptr cinfo)
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     
     cinfo->blocks_in_MCU = 0;
     
@@ -174,7 +441,7 @@ per_scan_setup (j_decompress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
@@ -282,6 +549,10 @@ finish_input_pass (j_decompress_ptr cinfo)
  * The consume_input method pointer points either here or to the
  * coefficient controller's consume_data routine, depending on whether
  * we are reading a compressed data segment or inter-segment markers.
+ *
+ * Note: This function should NOT return a pseudo SOS marker (one with a
+ * component count of zero) to the caller.  A pseudo marker received from
+ * read_markers is processed here and then skipped in favor of later markers.
  */
 
 METHODDEF(int)
@@ -293,41 +564,50 @@ consume_markers (j_decompress_ptr cinfo)
   if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
     return JPEG_REACHED_EOI;
 
-  val = (*cinfo->marker->read_markers) (cinfo);
-
-  switch (val) {
-  case JPEG_REACHED_SOS:	/* Found SOS */
-    if (inputctl->inheaders) {	/* 1st SOS */
-      initial_setup(cinfo);
-      inputctl->inheaders = FALSE;
-      /* Note: start_input_pass must be called by jdmaster.c
-       * before any more input can be consumed.  jdapimin.c is
-       * responsible for enforcing this sequencing.
-       */
-    } else {			/* 2nd or later SOS marker */
-      if (! inputctl->pub.has_multiple_scans)
-	ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
-      start_input_pass(cinfo);
-    }
-    break;
-  case JPEG_REACHED_EOI:	/* Found EOI */
-    inputctl->pub.eoi_reached = TRUE;
-    if (inputctl->inheaders) {	/* Tables-only datastream, apparently */
-      if (cinfo->marker->saw_SOF)
-	ERREXIT(cinfo, JERR_SOF_NO_SOS);
-    } else {
-      /* Prevent infinite loop in coef ctlr's decompress_data routine
-       * if user set output_scan_number larger than number of scans.
-       */
-      if (cinfo->output_scan_number > cinfo->input_scan_number)
-	cinfo->output_scan_number = cinfo->input_scan_number;
+  for (;;) {			/* Loop to pass pseudo SOS marker */
+    val = (*cinfo->marker->read_markers) (cinfo);
+
+    switch (val) {
+    case JPEG_REACHED_SOS:	/* Found SOS */
+      if (inputctl->inheaders) { /* 1st SOS */
+	if (inputctl->inheaders == 1)
+	  initial_setup(cinfo);
+	if (cinfo->comps_in_scan == 0) { /* pseudo SOS marker */
+	  inputctl->inheaders = 2;
+	  break;
+	}
+	inputctl->inheaders = 0;
+	/* Note: start_input_pass must be called by jdmaster.c
+	 * before any more input can be consumed.  jdapimin.c is
+	 * responsible for enforcing this sequencing.
+	 */
+      } else {			/* 2nd or later SOS marker */
+	if (! inputctl->pub.has_multiple_scans)
+	  ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
+	if (cinfo->comps_in_scan == 0) /* unexpected pseudo SOS marker */
+	  break;
+	start_input_pass(cinfo);
+      }
+      return val;
+    case JPEG_REACHED_EOI:	/* Found EOI */
+      inputctl->pub.eoi_reached = TRUE;
+      if (inputctl->inheaders) { /* Tables-only datastream, apparently */
+	if (cinfo->marker->saw_SOF)
+	  ERREXIT(cinfo, JERR_SOF_NO_SOS);
+      } else {
+	/* Prevent infinite loop in coef ctlr's decompress_data routine
+	 * if user set output_scan_number larger than number of scans.
+	 */
+	if (cinfo->output_scan_number > cinfo->input_scan_number)
+	  cinfo->output_scan_number = cinfo->input_scan_number;
+      }
+      return val;
+    case JPEG_SUSPENDED:
+      return val;
+    default:
+      return val;
     }
-    break;
-  case JPEG_SUSPENDED:
-    break;
   }
-
-  return val;
 }
 
 
@@ -343,7 +623,7 @@ reset_input_controller (j_decompress_ptr cinfo)
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
   /* Reset other modules */
   (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
   (*cinfo->marker->reset_marker_reader) (cinfo);
@@ -377,5 +657,5 @@ jinit_input_controller (j_decompress_ptr cinfo)
    */
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
 }
diff --git a/jpeg/jdmainct.c b/jpeg/jdmainct.c
index da19c7e5e..02723ca73 100644
--- a/jpeg/jdmainct.c
+++ b/jpeg/jdmainct.c
@@ -159,24 +159,24 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
  * This is done only once, not once per pass.
  */
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
 
   /* Get top-level space for component array pointers.
    * We alloc both arrays with one call to save a few cycles.
    */
-  jmain->xbuffer[0] = (JSAMPIMAGE)
+  main->xbuffer[0] = (JSAMPIMAGE)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
-  jmain->xbuffer[1] = jmain->xbuffer[0] + cinfo->num_components;
+  main->xbuffer[1] = main->xbuffer[0] + cinfo->num_components;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     /* Get space for pointer lists --- M+4 row groups in each list.
      * We alloc both pointer lists with one call to save a few cycles.
      */
@@ -184,9 +184,9 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
     xbuf += rgroup;		/* want one row group at negative offsets */
-    jmain->xbuffer[0][ci] = xbuf;
+    main->xbuffer[0][ci] = xbuf;
     xbuf += rgroup * (M + 4);
-    jmain->xbuffer[1][ci] = xbuf;
+    main->xbuffer[1][ci] = xbuf;
   }
 }
 
@@ -194,26 +194,26 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 LOCAL(void)
 make_funny_pointers (j_decompress_ptr cinfo)
 /* Create the funny pointer lists discussed in the comments above.
- * The actual workspace is already allocated (in jmain->buffer),
+ * The actual workspace is already allocated (in main->buffer),
  * and the space for the pointer lists is allocated too.
  * This routine just fills in the curiously ordered lists.
  * This will be repeated at the beginning of each pass.
  */
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY buf, xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = jmain->xbuffer[0][ci];
-    xbuf1 = jmain->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = main->xbuffer[0][ci];
+    xbuf1 = main->xbuffer[1][ci];
     /* First copy the workspace pointers as-is */
-    buf = jmain->buffer[ci];
+    buf = main->buffer[ci];
     for (i = 0; i < rgroup * (M + 2); i++) {
       xbuf0[i] = xbuf1[i] = buf[i];
     }
@@ -240,18 +240,18 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
  * This changes the pointer list state from top-of-image to the normal state.
  */
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = jmain->xbuffer[0][ci];
-    xbuf1 = jmain->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = main->xbuffer[0][ci];
+    xbuf1 = main->xbuffer[1][ci];
     for (i = 0; i < rgroup; i++) {
       xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
       xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
@@ -269,7 +269,7 @@ set_bottom_pointers (j_decompress_ptr cinfo)
  * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
  */
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup, iMCUheight, rows_left;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
@@ -277,8 +277,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Count sample rows in one iMCU row and in one row group */
-    iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
-    rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+    iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size;
+    rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size;
     /* Count nondummy sample rows remaining for this component */
     rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
     if (rows_left == 0) rows_left = iMCUheight;
@@ -286,12 +286,12 @@ set_bottom_pointers (j_decompress_ptr cinfo)
      * so we need only do it once.
      */
     if (ci == 0) {
-      jmain->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+      main->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
     }
     /* Duplicate the last real sample row rgroup*2 times; this pads out the
      * last partial rowgroup and ensures at least one full rowgroup of context.
      */
-    xbuf = jmain->xbuffer[jmain->whichptr][ci];
+    xbuf = main->xbuffer[main->whichptr][ci];
     for (i = 0; i < rgroup * 2; i++) {
       xbuf[rows_left + i] = xbuf[rows_left-1];
     }
@@ -306,27 +306,27 @@ set_bottom_pointers (j_decompress_ptr cinfo)
 METHODDEF(void)
 start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
     if (cinfo->upsample->need_context_rows) {
-      jmain->pub.process_data = process_data_context_main;
+      main->pub.process_data = process_data_context_main;
       make_funny_pointers(cinfo); /* Create the xbuffer[] lists */
-      jmain->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
-      jmain->context_state = CTX_PREPARE_FOR_IMCU;
-      jmain->iMCU_row_ctr = 0;
+      main->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
+      main->context_state = CTX_PREPARE_FOR_IMCU;
+      main->iMCU_row_ctr = 0;
     } else {
       /* Simple case with no context needed */
-      jmain->pub.process_data = process_data_simple_main;
+      main->pub.process_data = process_data_simple_main;
     }
-    jmain->buffer_full = FALSE;	/* Mark buffer empty */
-    jmain->rowgroup_ctr = 0;
+    main->buffer_full = FALSE;	/* Mark buffer empty */
+    main->rowgroup_ctr = 0;
     break;
 #ifdef QUANT_2PASS_SUPPORTED
   case JBUF_CRANK_DEST:
     /* For last pass of 2-pass quantization, just crank the postprocessor */
-    jmain->pub.process_data = process_data_crank_post;
+    main->pub.process_data = process_data_crank_post;
     break;
 #endif
   default:
@@ -346,32 +346,32 @@ process_data_simple_main (j_decompress_ptr cinfo,
 			  JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
 			  JDIMENSION out_rows_avail)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   JDIMENSION rowgroups_avail;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! jmain->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo, jmain->buffer))
+  if (! main->buffer_full) {
+    if (! (*cinfo->coef->decompress_data) (cinfo, main->buffer))
       return;			/* suspension forced, can do nothing more */
-    jmain->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    main->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size;
   /* Note: at the bottom of the image, we may pass extra garbage row groups
    * to the postprocessor.  The postprocessor has to check for bottom
    * of image anyway (at row resolution), so no point in us doing it too.
    */
 
   /* Feed the postprocessor */
-  (*cinfo->post->post_process_data) (cinfo, jmain->buffer,
-				     &jmain->rowgroup_ctr, rowgroups_avail,
+  (*cinfo->post->post_process_data) (cinfo, main->buffer,
+				     &main->rowgroup_ctr, rowgroups_avail,
 				     output_buf, out_row_ctr, out_rows_avail);
 
   /* Has postprocessor consumed all the data yet? If so, mark buffer empty */
-  if (jmain->rowgroup_ctr >= rowgroups_avail) {
-    jmain->buffer_full = FALSE;
-    jmain->rowgroup_ctr = 0;
+  if (main->rowgroup_ctr >= rowgroups_avail) {
+    main->buffer_full = FALSE;
+    main->rowgroup_ctr = 0;
   }
 }
 
@@ -386,15 +386,15 @@ process_data_context_main (j_decompress_ptr cinfo,
 			   JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
 			   JDIMENSION out_rows_avail)
 {
-  my_main_ptr jmain = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! jmain->buffer_full) {
+  if (! main->buffer_full) {
     if (! (*cinfo->coef->decompress_data) (cinfo,
-					   jmain->xbuffer[jmain->whichptr]))
+					   main->xbuffer[main->whichptr]))
       return;			/* suspension forced, can do nothing more */
-    jmain->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
-    jmain->iMCU_row_ctr++;	/* count rows received */
+    main->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    main->iMCU_row_ctr++;	/* count rows received */
   }
 
   /* Postprocessor typically will not swallow all the input data it is handed
@@ -402,47 +402,47 @@ process_data_context_main (j_decompress_ptr cinfo,
    * to exit and restart.  This switch lets us keep track of how far we got.
    * Note that each case falls through to the next on successful completion.
    */
-  switch (jmain->context_state) {
+  switch (main->context_state) {
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo, jmain->xbuffer[jmain->whichptr],
-			&jmain->rowgroup_ctr, jmain->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, main->xbuffer[main->whichptr],
+			&main->rowgroup_ctr, main->rowgroups_avail,
 			output_buf, out_row_ctr, out_rows_avail);
-    if (jmain->rowgroup_ctr < jmain->rowgroups_avail)
+    if (main->rowgroup_ctr < main->rowgroups_avail)
       return;			/* Need to suspend */
-    jmain->context_state = CTX_PREPARE_FOR_IMCU;
+    main->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
       return;			/* Postprocessor exactly filled output buf */
     /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
-    jmain->rowgroup_ctr = 0;
-    jmain->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+    main->rowgroup_ctr = 0;
+    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1);
     /* Check for bottom of image: if so, tweak pointers to "duplicate"
      * the last sample row, and adjust rowgroups_avail to ignore padding rows.
      */
-    if (jmain->iMCU_row_ctr == cinfo->total_iMCU_rows)
+    if (main->iMCU_row_ctr == cinfo->total_iMCU_rows)
       set_bottom_pointers(cinfo);
-    jmain->context_state = CTX_PROCESS_IMCU;
+    main->context_state = CTX_PROCESS_IMCU;
     /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo, jmain->xbuffer[jmain->whichptr],
-			&jmain->rowgroup_ctr, jmain->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, main->xbuffer[main->whichptr],
+			&main->rowgroup_ctr, main->rowgroups_avail,
 			output_buf, out_row_ctr, out_rows_avail);
-    if (jmain->rowgroup_ctr < jmain->rowgroups_avail)
+    if (main->rowgroup_ctr < main->rowgroups_avail)
       return;			/* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
-    if (jmain->iMCU_row_ctr == 1)
+    if (main->iMCU_row_ctr == 1)
       set_wraparound_pointers(cinfo);
     /* Prepare to load new iMCU row using other xbuffer list */
-    jmain->whichptr ^= 1;	/* 0=>1 or 1=>0 */
-    jmain->buffer_full = FALSE;
+    main->whichptr ^= 1;	/* 0=>1 or 1=>0 */
+    main->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
-    jmain->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
-    jmain->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
-    jmain->context_state = CTX_POSTPONED_ROW;
+    main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2);
+    main->context_state = CTX_POSTPONED_ROW;
   }
 }
 
@@ -475,15 +475,15 @@ process_data_crank_post (j_decompress_ptr cinfo,
 GLOBAL(void)
 jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr jmain;
+  my_main_ptr main;
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
-  jmain = (my_main_ptr)
+  main = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_d_main_controller *) jmain;
-  jmain->pub.start_pass = start_pass_main;
+  cinfo->main = (struct jpeg_d_main_controller *) main;
+  main->pub.start_pass = start_pass_main;
 
   if (need_full_buffer)		/* shouldn't happen */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -492,21 +492,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
    * ngroups is the number of row groups we need.
    */
   if (cinfo->upsample->need_context_rows) {
-    if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+    if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */
       ERREXIT(cinfo, JERR_NOTIMPL);
     alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
-    ngroups = cinfo->min_DCT_scaled_size + 2;
+    ngroups = cinfo->min_DCT_v_scaled_size + 2;
   } else {
-    ngroups = cinfo->min_DCT_scaled_size;
+    ngroups = cinfo->min_DCT_v_scaled_size;
   }
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    jmain->buffer[ci] = (*cinfo->mem->alloc_sarray)
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 compptr->width_in_blocks * compptr->DCT_scaled_size,
+			 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
 			 (JDIMENSION) (rgroup * ngroups));
   }
 }
diff --git a/jpeg/jdmarker.c b/jpeg/jdmarker.c
index f4cca8cc8..f2a9cc429 100644
--- a/jpeg/jdmarker.c
+++ b/jpeg/jdmarker.c
@@ -2,6 +2,7 @@
  * jdmarker.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -234,7 +235,8 @@ get_soi (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof (j_decompress_ptr cinfo, boolean is_baseline, boolean is_prog,
+	 boolean is_arith)
 /* Process a SOFn marker */
 {
   INT32 length;
@@ -242,6 +244,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
   jpeg_component_info * compptr;
   INPUT_VARS(cinfo);
 
+  cinfo->is_baseline = is_baseline;
   cinfo->progressive_mode = is_prog;
   cinfo->arith_code = is_arith;
 
@@ -315,7 +318,9 @@ get_sos (j_decompress_ptr cinfo)
 
   TRACEMS1(cinfo, 1, JTRC_SOS, n);
 
-  if (length != (n * 2 + 6) || n < 1 || n > MAX_COMPS_IN_SCAN)
+  if (length != (n * 2 + 6) || n > MAX_COMPS_IN_SCAN ||
+      (n == 0 && !cinfo->progressive_mode))
+      /* pseudo SOS marker only allowed in progressive mode */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   cinfo->comps_in_scan = n;
@@ -359,8 +364,8 @@ get_sos (j_decompress_ptr cinfo)
   /* Prepare to scan data & restart markers */
   cinfo->marker->next_restart_num = 0;
 
-  /* Count another SOS marker */
-  cinfo->input_scan_number++;
+  /* Count another (non-pseudo) SOS marker */
+  if (n) cinfo->input_scan_number++;
 
   INPUT_SYNC(cinfo);
   return TRUE;
@@ -490,16 +495,18 @@ LOCAL(boolean)
 get_dqt (j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
-  INT32 length;
-  int n, i, prec;
+  INT32 length, count, i;
+  int n, prec;
   unsigned int tmp;
   JQUANT_TBL *quant_ptr;
+  const int *natural_order;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
 
   while (length > 0) {
+    length--;
     INPUT_BYTE(cinfo, n, return FALSE);
     prec = n >> 4;
     n &= 0x0F;
@@ -513,13 +520,43 @@ get_dqt (j_decompress_ptr cinfo)
       cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
     quant_ptr = cinfo->quant_tbl_ptrs[n];
 
-    for (i = 0; i < DCTSIZE2; i++) {
+    if (prec) {
+      if (length < DCTSIZE2 * 2) {
+	/* Initialize full table for safety. */
+	for (i = 0; i < DCTSIZE2; i++) {
+	  quant_ptr->quantval[i] = 1;
+	}
+	count = length >> 1;
+      } else
+	count = DCTSIZE2;
+    } else {
+      if (length < DCTSIZE2) {
+	/* Initialize full table for safety. */
+	for (i = 0; i < DCTSIZE2; i++) {
+	  quant_ptr->quantval[i] = 1;
+	}
+	count = length;
+      } else
+	count = DCTSIZE2;
+    }
+
+    switch (count) {
+    case (2*2): natural_order = jpeg_natural_order2; break;
+    case (3*3): natural_order = jpeg_natural_order3; break;
+    case (4*4): natural_order = jpeg_natural_order4; break;
+    case (5*5): natural_order = jpeg_natural_order5; break;
+    case (6*6): natural_order = jpeg_natural_order6; break;
+    case (7*7): natural_order = jpeg_natural_order7; break;
+    default:    natural_order = jpeg_natural_order;  break;
+    }
+
+    for (i = 0; i < count; i++) {
       if (prec)
 	INPUT_2BYTES(cinfo, tmp, return FALSE);
       else
 	INPUT_BYTE(cinfo, tmp, return FALSE);
       /* We convert the zigzag-order table to natural array order. */
-      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+      quant_ptr->quantval[natural_order[i]] = (UINT16) tmp;
     }
 
     if (cinfo->err->trace_level >= 2) {
@@ -532,8 +569,8 @@ get_dqt (j_decompress_ptr cinfo)
       }
     }
 
-    length -= DCTSIZE2+1;
-    if (prec) length -= DCTSIZE2;
+    length -= count;
+    if (prec) length -= count;
   }
 
   if (length != 0)
@@ -946,6 +983,11 @@ first_marker (j_decompress_ptr cinfo)
  *
  * Returns same codes as are defined for jpeg_consume_input:
  * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ *
+ * Note: This function may return a pseudo SOS marker (one with a
+ * component count of zero) to be handled by the input controller's
+ * consume_input method.  consume_input itself should process and then
+ * filter out (skip) the pseudo marker rather than pass it to its caller.
  */
 
 METHODDEF(int)
@@ -975,23 +1017,27 @@ read_markers (j_decompress_ptr cinfo)
       break;
 
     case M_SOF0:		/* Baseline */
+      if (! get_sof(cinfo, TRUE, FALSE, FALSE))
+	return JPEG_SUSPENDED;
+      break;
+
     case M_SOF1:		/* Extended sequential, Huffman */
-      if (! get_sof(cinfo, FALSE, FALSE))
+      if (! get_sof(cinfo, FALSE, FALSE, FALSE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF2:		/* Progressive, Huffman */
-      if (! get_sof(cinfo, TRUE, FALSE))
+      if (! get_sof(cinfo, FALSE, TRUE, FALSE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF9:		/* Extended sequential, arithmetic */
-      if (! get_sof(cinfo, FALSE, TRUE))
+      if (! get_sof(cinfo, FALSE, FALSE, TRUE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF10:		/* Progressive, arithmetic */
-      if (! get_sof(cinfo, TRUE, TRUE))
+      if (! get_sof(cinfo, FALSE, TRUE, TRUE))
 	return JPEG_SUSPENDED;
       break;
 
diff --git a/jpeg/jdmaster.c b/jpeg/jdmaster.c
index 2802c5b7b..8c1146e4f 100644
--- a/jpeg/jdmaster.c
+++ b/jpeg/jdmaster.c
@@ -2,6 +2,7 @@
  * jdmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -61,9 +62,12 @@ use_merged_upsample (j_decompress_ptr cinfo)
       cinfo->comp_info[2].v_samp_factor != 1)
     return FALSE;
   /* furthermore, it doesn't work if we've scaled the IDCTs differently */
-  if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+  if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size)
     return FALSE;
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;			/* by golly, it'll work... */
@@ -82,7 +86,9 @@ use_merged_upsample (j_decompress_ptr cinfo)
 
 GLOBAL(void)
 jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
-/* Do computations that are needed before master selection phase */
+/* Do computations that are needed before master selection phase.
+ * This function is used for full decompression.
+ */
 {
 #ifdef IDCT_SCALING_SUPPORTED
   int ci;
@@ -93,52 +99,38 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
   if (cinfo->global_state != DSTATE_READY)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
+  /* Compute core output image dimensions and DCT scaling choices. */
+  jpeg_core_output_dimensions(cinfo);
+
 #ifdef IDCT_SCALING_SUPPORTED
 
-  /* Compute actual output image dimensions and DCT scaling choices. */
-  if (cinfo->scale_num * 8 <= cinfo->scale_denom) {
-    /* Provide 1/8 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 8L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 8L);
-    cinfo->min_DCT_scaled_size = 1;
-  } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
-    /* Provide 1/4 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 4L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 4L);
-    cinfo->min_DCT_scaled_size = 2;
-  } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
-    /* Provide 1/2 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 2L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 2L);
-    cinfo->min_DCT_scaled_size = 4;
-  } else {
-    /* Provide 1/1 scaling */
-    cinfo->output_width = cinfo->image_width;
-    cinfo->output_height = cinfo->image_height;
-    cinfo->min_DCT_scaled_size = DCTSIZE;
-  }
   /* In selecting the actual DCT scaling for each component, we try to
    * scale up the chroma components via IDCT scaling rather than upsampling.
    * This saves time if the upsampler gets to use 1:1 scaling.
-   * Note this code assumes that the supported DCT scalings are powers of 2.
+   * Note this code adapts subsampling ratios which are powers of 2.
    */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    int ssize = cinfo->min_DCT_scaled_size;
-    while (ssize < DCTSIZE &&
-	   (compptr->h_samp_factor * ssize * 2 <=
-	    cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
-	   (compptr->v_samp_factor * ssize * 2 <=
-	    cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+    int ssize = 1;
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+	   (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
       ssize = ssize * 2;
     }
-    compptr->DCT_scaled_size = ssize;
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+	   (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support IDCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+	compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+	compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
   }
 
   /* Recompute downsampled dimensions of components;
@@ -149,23 +141,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width *
-		    (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+		    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height *
-		    (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+		    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
   }
 
-#else /* !IDCT_SCALING_SUPPORTED */
-
-  /* Hardwire it to "no scaling" */
-  cinfo->output_width = cinfo->image_width;
-  cinfo->output_height = cinfo->image_height;
-  /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
-   * and has computed unscaled downsampled_width and downsampled_height.
-   */
-
 #endif /* IDCT_SCALING_SUPPORTED */
 
   /* Report number of components in selected colorspace. */
@@ -372,17 +355,10 @@ master_selection (j_decompress_ptr cinfo)
   /* Inverse DCT */
   jinit_inverse_dct(cinfo);
   /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_decoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
+    jinit_huff_decoder(cinfo);
   }
 
   /* Initialize principal buffer controllers. */
diff --git a/jpeg/jdphuff.c b/jpeg/jdphuff.c
deleted file mode 100644
index 226780994..000000000
--- a/jpeg/jdphuff.c
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * jdphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy decoding routines for progressive JPEG.
- *
- * Much of the complexity here has to do with supporting input suspension.
- * If the data source module demands suspension, we want to be able to back
- * up to the start of the current MCU.  To do this, we copy state variables
- * into local working storage, and update them back to the permanent
- * storage only upon successful completion of an MCU.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdhuff.c */
-
-
-#ifdef D_PROGRESSIVE_SUPPORTED
-
-/*
- * Expanded entropy decoder object for progressive Huffman decoding.
- *
- * The savable_state subrecord contains fields that change within an MCU,
- * but must not be updated permanently until we complete the MCU.
- */
-
-typedef struct {
-  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
-  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
-} savable_state;
-
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-	((dest).EOBRUN = (src).EOBRUN, \
-	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
-	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
-	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
-	 (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
-typedef struct {
-  struct jpeg_entropy_decoder pub; /* public fields */
-
-  /* These fields are loaded into local variables at start of each MCU.
-   * In case of suspension, we exit WITHOUT updating them.
-   */
-  bitread_perm_state bitstate;	/* Bit buffer at start of MCU */
-  savable_state saved;		/* Other state at start of MCU */
-
-  /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-
-  /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
-} phuff_entropy_decoder;
-
-typedef phuff_entropy_decoder * phuff_entropy_ptr;
-
-/* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_DC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-
-
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band, bad;
-  int ci, coefi, tbl;
-  int *coef_bit_ptr;
-  jpeg_component_info * compptr;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* Validate scan parameters */
-  bad = FALSE;
-  if (is_DC_band) {
-    if (cinfo->Se != 0)
-      bad = TRUE;
-  } else {
-    /* need not check Ss/Se < 0 since they came from unsigned bytes */
-    if (cinfo->Ss > cinfo->Se || cinfo->Se >= DCTSIZE2)
-      bad = TRUE;
-    /* AC scans may have only one component */
-    if (cinfo->comps_in_scan != 1)
-      bad = TRUE;
-  }
-  if (cinfo->Ah != 0) {
-    /* Successive approximation refinement scan: must have Al = Ah-1. */
-    if (cinfo->Al != cinfo->Ah-1)
-      bad = TRUE;
-  }
-  if (cinfo->Al > 13)		/* need not check for < 0 */
-    bad = TRUE;
-  /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
-   * but the spec doesn't say so, and we try to be liberal about what we
-   * accept.  Note: large Al values could result in out-of-range DC
-   * coefficients during early scans, leading to bizarre displays due to
-   * overflows in the IDCT math.  But we won't crash.
-   */
-  if (bad)
-    ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
-	     cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
-  /* Update progression status, and verify that scan order is legal.
-   * Note that inter-scan inconsistencies are treated as warnings
-   * not fatal errors ... not clear if this is right way to behave.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    int cindex = cinfo->cur_comp_info[ci]->component_index;
-    coef_bit_ptr = & cinfo->coef_bits[cindex][0];
-    if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
-      WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
-    for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
-      int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
-      if (cinfo->Ah != expected)
-	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
-      coef_bit_ptr[coefi] = cinfo->Al;
-    }
-  }
-
-  /* Select MCU decoding routine */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_first;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_refine;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_refine;
-  }
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Make sure requested tables are present, and compute derived tables.
-     * We may build same derived table more than once, but it's not expensive.
-     */
-    if (is_DC_band) {
-      if (cinfo->Ah == 0) {	/* DC refinement needs no table */
-	tbl = compptr->dc_tbl_no;
-	jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
-				& entropy->derived_tbls[tbl]);
-      }
-    } else {
-      tbl = compptr->ac_tbl_no;
-      jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
-			      & entropy->derived_tbls[tbl]);
-      /* remember the single active table */
-      entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
-    }
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Initialize bitread state variables */
-  entropy->bitstate.bits_left = 0;
-  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
-  entropy->pub.insufficient_data = FALSE;
-
-  /* Initialize private state variables */
-  entropy->saved.EOBRUN = 0;
-
-  /* Initialize restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-}
-
-
-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
- * Check for a restart marker & resynchronize decoder.
- * Returns FALSE if must suspend.
- */
-
-LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int ci;
-
-  /* Throw away any unused bits remaining in bit buffer; */
-  /* include any full bytes in next_marker's count of discarded bytes */
-  cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8;
-  entropy->bitstate.bits_left = 0;
-
-  /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
-    return FALSE;
-
-  /* Re-initialize DC predictions to 0 */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++)
-    entropy->saved.last_dc_val[ci] = 0;
-  /* Re-init EOB run count, too */
-  entropy->saved.EOBRUN = 0;
-
-  /* Reset restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-
-  /* Reset out-of-data flag, unless read_restart_marker left us smack up
-   * against a marker.  In that case we will end up treating the next data
-   * segment as empty, and we can avoid producing bogus output pixels by
-   * leaving the flag set.
-   */
-  if (cinfo->unread_marker == 0)
-    entropy->pub.insufficient_data = FALSE;
-
-  return TRUE;
-}
-
-
-/*
- * Huffman MCU decoding.
- * Each of these routines decodes and returns one MCU's worth of
- * Huffman-compressed coefficients. 
- * The coefficients are reordered from zigzag order into natural array order,
- * but are not dequantized.
- *
- * The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
- *
- * We return FALSE if data source requested suspension.  In that case no
- * changes have been made to permanent state.  (Exception: some output
- * coefficients may already have been assigned.  This is harmless for
- * spectral selection, since we'll just re-assign them on the next call.
- * Successive approximation AC refinement has to be more careful, however.)
- */
-
-/*
- * MCU decoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Al = cinfo->Al;
-  register int s, r;
-  int blkn, ci;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
-
-    /* Outer loop handles each block in the MCU */
-
-    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      block = MCU_data[blkn];
-      ci = cinfo->MCU_membership[blkn];
-      compptr = cinfo->cur_comp_info[ci];
-      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
-
-      /* Decode a single block's worth of coefficients */
-
-      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
-      }
-
-      /* Convert DC difference to actual value, update last_dc_val */
-      s += state.last_dc_val[ci];
-      state.last_dc_val[ci] = s;
-      /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) (s << Al);
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state.
-     * We can avoid loading/saving bitread state if in an EOB run.
-     */
-    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-
-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
-      EOBRUN--;			/* ...process it now (we do nothing) */
-    else {
-      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-      block = MCU_data[0];
-      tbl = entropy->ac_derived_tbl;
-
-      for (k = cinfo->Ss; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  k += r;
-	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	  r = GET_BITS(s);
-	  s = HUFF_EXTEND(r, s);
-	  /* Scale and output coefficient in natural (dezigzagged) order */
-	  (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
-	} else {
-	  if (r == 15) {	/* ZRL */
-	    k += 15;		/* skip 15 zeroes in band */
-	  } else {		/* EOBr, run length is 2^r + appended bits */
-	    EOBRUN = 1 << r;
-	    if (r) {		/* EOBr, r > 0 */
-	      CHECK_BIT_BUFFER(br_state, r, return FALSE);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    EOBRUN--;		/* this band is processed at this moment */
-	    break;		/* force end-of-band */
-	  }
-	}
-      }
-
-      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    }
-
-    /* Completed MCU, so update state */
-    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int blkn;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* Not worth the cycles to check insufficient_data here,
-   * since we will not change the data anyway if we read zeroes.
-   */
-
-  /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-
-  /* Outer loop handles each block in the MCU */
-
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* Encoded data is simply the next bit of the two's-complement DC value */
-    CHECK_BIT_BUFFER(br_state, 1, return FALSE);
-    if (GET_BITS(1))
-      (*block)[0] |= p1;
-    /* Note: since we use |=, repeating the assignment later is safe */
-  }
-
-  /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  JCOEFPTR thiscoef;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-  int num_newnz;
-  int newnz_pos[DCTSIZE2];
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, don't modify the MCU.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-    block = MCU_data[0];
-    tbl = entropy->ac_derived_tbl;
-
-    /* If we are forced to suspend, we must undo the assignments to any newly
-     * nonzero coefficients in the block, because otherwise we'd get confused
-     * next time about which coefficients were already nonzero.
-     * But we need not undo addition of bits to already-nonzero coefficients;
-     * instead, we can test the current bit to see if we already did it.
-     */
-    num_newnz = 0;
-
-    /* initialize coefficient loop counter to start of band */
-    k = cinfo->Ss;
-
-    if (EOBRUN == 0) {
-      for (; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  if (s != 1)		/* size of new coef should always be 1 */
-	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1))
-	    s = p1;		/* newly nonzero coef is positive */
-	  else
-	    s = m1;		/* newly nonzero coef is negative */
-	} else {
-	  if (r != 15) {
-	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
-	    if (r) {
-	      CHECK_BIT_BUFFER(br_state, r, goto undoit);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    break;		/* rest of block is handled by EOB logic */
-	  }
-	  /* note s = 0 for processing ZRL */
-	}
-	/* Advance over already-nonzero coefs and r still-zero coefs,
-	 * appending correction bits to the nonzeroes.  A correction bit is 1
-	 * if the absolute value of the coefficient must be increased.
-	 */
-	do {
-	  thiscoef = *block + jpeg_natural_order[k];
-	  if (*thiscoef != 0) {
-	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	    if (GET_BITS(1)) {
-	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-		if (*thiscoef >= 0)
-		  *thiscoef += p1;
-		else
-		  *thiscoef += m1;
-	      }
-	    }
-	  } else {
-	    if (--r < 0)
-	      break;		/* reached target zero coefficient */
-	  }
-	  k++;
-	} while (k <= Se);
-	if (s) {
-	  int pos = jpeg_natural_order[k];
-	  /* Output newly nonzero coefficient */
-	  (*block)[pos] = (JCOEF) s;
-	  /* Remember its position in case we have to suspend */
-	  newnz_pos[num_newnz++] = pos;
-	}
-      }
-    }
-
-    if (EOBRUN > 0) {
-      /* Scan any remaining coefficient positions after the end-of-band
-       * (the last newly nonzero coefficient, if any).  Append a correction
-       * bit to each already-nonzero coefficient.  A correction bit is 1
-       * if the absolute value of the coefficient must be increased.
-       */
-      for (; k <= Se; k++) {
-	thiscoef = *block + jpeg_natural_order[k];
-	if (*thiscoef != 0) {
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1)) {
-	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-	      if (*thiscoef >= 0)
-		*thiscoef += p1;
-	      else
-		*thiscoef += m1;
-	    }
-	  }
-	}
-      }
-      /* Count one block completed in EOB run */
-      EOBRUN--;
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-
-undoit:
-  /* Re-zero any output coefficients that we made newly nonzero */
-  while (num_newnz > 0)
-    (*block)[newnz_pos[--num_newnz]] = 0;
-
-  return FALSE;
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy decoding.
- */
-
-GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int *coef_bit_ptr;
-  int ci, i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff_decoder;
-
-  /* Mark derived tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-  }
-
-  /* Create progression status table */
-  cinfo->coef_bits = (int (*)[DCTSIZE2])
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				cinfo->num_components*DCTSIZE2*SIZEOF(int));
-  coef_bit_ptr = & cinfo->coef_bits[0][0];
-  for (ci = 0; ci < cinfo->num_components; ci++) 
-    for (i = 0; i < DCTSIZE2; i++)
-      *coef_bit_ptr++ = -1;
-}
-
-#endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/jpeg/jdsample.c b/jpeg/jdsample.c
index 80ffefb2a..7bc8885b0 100644
--- a/jpeg/jdsample.c
+++ b/jpeg/jdsample.c
@@ -2,13 +2,14 @@
  * jdsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2002-2008 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains upsampling routines.
  *
  * Upsampling input data is counted in "row groups".  A row group
- * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * is defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size)
  * sample rows of each component.  Upsampling will normally produce
  * max_v_samp_factor pixel rows from each row group (but this could vary
  * if the upsampler is applying a scale factor of its own).
@@ -237,11 +238,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
   JSAMPROW outend;
-  int inrow;
+  int outrow;
 
-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
+  for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) {
+    inptr = input_data[outrow];
+    outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
       invalue = *inptr++;	/* don't need GETJSAMPLE() here */
@@ -285,112 +286,6 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 }
 
 
-/*
- * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
- *
- * The upsampling algorithm is linear interpolation between pixel centers,
- * also known as a "triangle filter".  This is a good compromise between
- * speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
- * of the way between input pixel centers.
- *
- * A note about the "bias" calculations: when rounding fractional values to
- * integer, we do not want to always round 0.5 up to the next integer.
- * If we did that, we'd introduce a noticeable bias towards larger values.
- * Instead, this code is arranged so that 0.5 will be rounded up or down at
- * alternate pixel locations (a simple ordered dither pattern).
- */
-
-METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register int invalue;
-  register JDIMENSION colctr;
-  int inrow;
-
-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
-    /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
-    *outptr++ = (JSAMPLE) invalue;
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
-
-    for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-      /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
-    }
-
-    /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
-    *outptr++ = (JSAMPLE) invalue;
-  }
-}
-
-
-/*
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- * Again a triangle filter; see comments for h2v1 case, above.
- *
- * It is OK for us to reference the adjacent input rows because we demanded
- * context from the main buffer controller (see initialization code).
- */
-
-METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr0, inptr1, outptr;
-#if BITS_IN_JSAMPLE == 8
-  register int thiscolsum, lastcolsum, nextcolsum;
-#else
-  register INT32 thiscolsum, lastcolsum, nextcolsum;
-#endif
-  register JDIMENSION colctr;
-  int inrow, outrow, v;
-
-  inrow = outrow = 0;
-  while (outrow < cinfo->max_v_samp_factor) {
-    for (v = 0; v < 2; v++) {
-      /* inptr0 points to nearest input row, inptr1 points to next nearest */
-      inptr0 = input_data[inrow];
-      if (v == 0)		/* next nearest is row above */
-	inptr1 = input_data[inrow-1];
-      else			/* next nearest is row below */
-	inptr1 = input_data[inrow+1];
-      outptr = output_data[outrow++];
-
-      /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-      lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-
-      for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-	/* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
-	/* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-	nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-	lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-      }
-
-      /* Special case for last column */
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
-    }
-    inrow++;
-  }
-}
-
-
 /*
  * Module initialization routine for upsampling.
  */
@@ -401,7 +296,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
   my_upsample_ptr upsample;
   int ci;
   jpeg_component_info * compptr;
-  boolean need_buffer, do_fancy;
+  boolean need_buffer;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
   upsample = (my_upsample_ptr)
@@ -415,11 +310,6 @@ jinit_upsampler (j_decompress_ptr cinfo)
   if (cinfo->CCIR601_sampling)	/* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
 
-  /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
-   * so don't ask for it.
-   */
-  do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
-
   /* Verify we can handle the sampling factors, select per-component methods,
    * and create storage as needed.
    */
@@ -428,10 +318,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
     /* Compute size of an "input group" after IDCT scaling.  This many samples
      * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
      */
-    h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
-    v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
+    h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+		 cinfo->min_DCT_h_scaled_size;
+    v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		 cinfo->min_DCT_v_scaled_size;
     h_out_group = cinfo->max_h_samp_factor;
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -446,19 +336,12 @@ jinit_upsampler (j_decompress_ptr cinfo)
       need_buffer = FALSE;
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
-      /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      /* Special case for 2h1v upsampling */
+      upsample->methods[ci] = h2v1_upsample;
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
-      /* Special cases for 2h2v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
-	upsample->pub.need_context_rows = TRUE;
-      } else
-	upsample->methods[ci] = h2v2_upsample;
+      /* Special case for 2h2v upsampling */
+      upsample->methods[ci] = h2v2_upsample;
     } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
diff --git a/jpeg/jdtrans.c b/jpeg/jdtrans.c
index 6c0ab715d..22dd47fb5 100644
--- a/jpeg/jdtrans.c
+++ b/jpeg/jdtrans.c
@@ -2,6 +2,7 @@
  * jdtrans.c
  *
  * Copyright (C) 1995-1997, Thomas G. Lane.
+ * Modified 2000-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -99,18 +100,14 @@ transdecode_master_selection (j_decompress_ptr cinfo)
   /* This is effectively a buffered-image operation. */
   cinfo->buffered_image = TRUE;
 
+  /* Compute output image dimensions and related values. */
+  jpeg_core_output_dimensions(cinfo);
+
   /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_decoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
+    jinit_huff_decoder(cinfo);
   }
 
   /* Always get a full-image coefficient buffer. */
diff --git a/jpeg/jerror.c b/jpeg/jerror.c
index a44463cf1..3da7be86a 100644
--- a/jpeg/jerror.c
+++ b/jpeg/jerror.c
@@ -23,7 +23,6 @@
 #include "jpeglib.h"
 #include "jversion.h"
 #include "jerror.h"
-#include <stdlib.h>
 
 #ifdef USE_WINDOWS_MESSAGEBOX
 #include <windows.h>
diff --git a/jpeg/jerror.h b/jpeg/jerror.h
index fc2fffeac..1cfb2b19d 100644
--- a/jpeg/jerror.h
+++ b/jpeg/jerror.h
@@ -2,6 +2,7 @@
  * jerror.h
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -39,14 +40,15 @@ typedef enum {
 JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
 
 /* For maintenance convenience, list is alphabetical by message code name */
-JMESSAGE(JERR_ARITH_NOTIMPL,
-	 "Sorry, there are legal restrictions on arithmetic coding")
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
 JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
 JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
-JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported")
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+	 "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
 JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
 JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
@@ -93,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@@ -170,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
 	 "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
@@ -227,6 +231,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
    (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6)  /* error exit, 6 int parms */ \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) /* call error_exit */
 #define ERREXITS(cinfo,code,str)  \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
diff --git a/jpeg/jfdctflt.c b/jpeg/jfdctflt.c
index 79d7a0078..74d0d862d 100644
--- a/jpeg/jfdctflt.c
+++ b/jpeg/jfdctflt.c
@@ -2,6 +2,7 @@
  * jfdctflt.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -56,41 +57,46 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
   FAST_FLOAT z1, z2, z3, z4, z5, z11, z13;
   FAST_FLOAT *dataptr;
+  JSAMPROW elemptr;
   int ctr;
 
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]));
+    tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]));
+    tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]));
+    tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]));
+    tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]));
+    tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]));
+    tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]));
+    tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]));
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
     dataptr[2] = tmp13 + z1;	/* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
@@ -126,21 +132,21 @@ jpeg_fdct_float (FAST_FLOAT * data)
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
diff --git a/jpeg/jfdctfst.c b/jpeg/jfdctfst.c
index ccb378a3b..8cad5f229 100644
--- a/jpeg/jfdctfst.c
+++ b/jpeg/jfdctfst.c
@@ -2,6 +2,7 @@
  * jfdctfst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -111,42 +112,47 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
   DCTELEM z1, z2, z3, z4, z5, z11, z13;
   DCTELEM *dataptr;
+  JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+    tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
     dataptr[2] = tmp13 + z1;	/* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
@@ -182,21 +188,21 @@ jpeg_fdct_ifast (DCTELEM * data)
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
diff --git a/jpeg/jfdctint.c b/jpeg/jfdctint.c
index 0a78b64ae..1dde58c49 100644
--- a/jpeg/jfdctint.c
+++ b/jpeg/jfdctint.c
@@ -2,6 +2,7 @@
  * jfdctint.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modification developed 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -21,6 +22,23 @@
  * The advantage of this method is that no data path contains more than one
  * multiplication; this allows a very simple and accurate implementation in
  * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide FDCT routines with various input sample block sizes for
+ * direct resolution reduction or enlargement and for directly resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
+ *
+ * For N<8 we fill the remaining block coefficients with zero.
+ * For N>8 we apply a partial N-point FDCT on the input samples, computing
+ * just the lower 8 frequency coefficients and discarding the rest.
+ *
+ * We must scale the output coefficients of the N-point FDCT appropriately
+ * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
+ * is folded into the constant multipliers (pass 2) and/or final/initial
+ * shifting.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
  */
 
 #define JPEG_INTERNALS
@@ -36,7 +54,7 @@
  */
 
 #if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
 #endif
 
 
@@ -137,12 +155,13 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM * data)
+jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp0, tmp1, tmp2, tmp3;
   INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  INT32 z1;
   DCTELEM *dataptr;
+  JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
@@ -151,62 +170,74 @@ jpeg_fdct_islow (DCTELEM * data)
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
      */
-    
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
     tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
+    tmp12 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-				   CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-				   CONST_BITS-PASS1_BITS);
-    
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				       CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				       CONST_BITS-PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents cos(K*pi/16).
-     * i0..i3 in the paper are tmp4..tmp7 here.
+     * cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
      */
-    
-    z1 = tmp4 + tmp7;
-    z2 = tmp5 + tmp6;
-    z3 = tmp4 + tmp6;
-    z4 = tmp5 + tmp7;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
-    
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
     dataptr += DCTSIZE;		/* advance pointer to next row */
   }
 
@@ -217,67 +248,4101 @@ jpeg_fdct_islow (DCTELEM * data)
 
   dataptr = data;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
-    /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
-     */
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+    tmp12 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
-    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
-    
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-					   CONST_BITS+PASS1_BITS);
-    
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents cos(K*pi/16).
-     * i0..i3 in the paper are tmp4..tmp7 here.
+     * cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
      */
-    
-    z1 = tmp4 + tmp7;
-    z2 = tmp5 + tmp6;
-    z3 = tmp4 + tmp6;
-    z4 = tmp5 + tmp7;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
-					   CONST_BITS+PASS1_BITS);
-    
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+#ifdef DCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform the forward DCT on the 7x7 samples at sample_data[0..6][start_col..],
+ * filling the 8x8 block data[]; its last row and column are left zero.
+ */
+GLOBAL(void)
+jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero the whole output block; only its 7x7 corner is computed below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/14). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) {
+    elemptr = sample_data[ctr] + start_col; /* first sample of row ctr */
+
+    /* Even part: sums of mirrored sample pairs; tmp3 is the unpaired center */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+    tmp3 = GETJSAMPLE(elemptr[3]);
+    /* Differences of the same pairs; they are consumed by the odd part */
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+    z1 = tmp0 + tmp2;
+    /* Apply unsigned->signed conversion: the DC term sums seven samples */
+    dataptr[0] = (DCTELEM)
+      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp3 += tmp3; /* tmp3 is twice the center sample from here on */
+    z1 -= tmp3;
+    z1 -= tmp3; /* z1 = tmp0 + tmp2 - 4 * center sample */
+    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/7)**2 = 64/49, which we fold
+   * into the constant multipliers:
+   * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) { /* only columns 0..6 hold pass 1 output */
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+    z1 = tmp0 + tmp2;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp3 += tmp3; /* tmp3 is twice the center row value from here on */
+    z1 -= tmp3;
+    z1 -= tmp3; /* z1 = tmp0 + tmp2 - 4 * center row value */
+    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on the 6x6 samples at sample_data[0..5][start_col..],
+ * filling the 8x8 block data[]; its last two rows and columns are left zero.
+ */
+GLOBAL(void)
+jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11, tmp12;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero the whole output block; only its 6x6 corner is computed below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/12). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col; /* first sample of row ctr */
+
+    /* Even part: sums of mirrored sample pairs */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+    /* Differences of the same pairs feed the odd part (tmp0..tmp2 reused) */
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion: the DC term sums six samples */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part: tmp10 is descaled here, so the other terms are only shifted */
+
+    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
+		    CONST_BITS-PASS1_BITS);
+
+    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)**2 = 16/9, which we fold
+   * into the constant multipliers:
+   * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) { /* only columns 0..5 hold pass 1 output */
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part: tmp10 is left undescaled; DESCALE is applied per output */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x5 sample block (rows 0..4, columns start_col..start_col+4).
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero all DCTSIZE2 output coefficients; only rows/columns 0..4 are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows (5 samples in, 5 coefficients out per row). */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2 as part of output adaptation */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/10). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: sums/differences of samples mirrored about the center sample */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+    tmp2 = GETJSAMPLE(elemptr[2]);
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
+    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
+    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part: built from the mirrored differences tmp0, tmp1 */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+	      CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns (in place, on the 5 rows produced by pass 1).
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/5)**2 = 64/25, which we partially
+   * fold into the constant multipliers (the other factor of 2 was applied in pass 1):
+   * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    /* Even part: same butterfly as pass 1, reading with row stride DCTSIZE */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2];
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+    /* Odd part: built from the mirrored differences tmp0, tmp1 */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x4 sample block (rows 0..3, columns start_col..start_col+3).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero all DCTSIZE2 output coefficients; only rows/columns 0..3 are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows (4 samples in, 4 coefficients out per row). */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by (8/4)**2 = 2**2, which we apply here via the extra shift of 2. */
+  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: sums (tmp0, tmp1) and differences (tmp10, tmp11) of mirrored samples */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
+    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
+
+    /* Odd part: built from the mirrored differences tmp10, tmp11 */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add rounding bias (half the final divisor) once, shared by both RIGHT_SHIFTs below. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-2);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-2);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns (in place, on the 4 rows produced by pass 1).
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    /* Even part: same butterfly as pass 1, reading with row stride DCTSIZE */
+
+    /* Add rounding bias to tmp0 only; it reaches both outputs tmp0 +/- tmp1 below. */
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+    /* Odd part: built from the mirrored differences tmp10, tmp11 */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add rounding bias (half the final divisor) once, shared by both RIGHT_SHIFTs below. */
+    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS+PASS1_BITS);
+
     dataptr++;			/* advance pointer to next column */
   }
 }
 
+
+/*
+ * Perform the forward DCT on a 3x3 sample block (rows 0..2, columns start_col..start_col+2).
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero all DCTSIZE2 output coefficients; only rows/columns 0..2 are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows (3 samples in, 3 coefficients out per row). */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2**2 as part of output adaptation */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/6). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: tmp0 = sum of the outer samples, tmp1 = center, tmp2 = outer difference */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+	      CONST_BITS-PASS1_BITS-2);
+
+    /* Odd part: the single odd coefficient comes from the outer difference */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
+	      CONST_BITS-PASS1_BITS-2);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns (in place, on the 3 rows produced by pass 1).
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/3)**2 = 64/9, which we partially
+   * fold into the constant multipliers (the other factor of 4 was applied in pass 1):
+   * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    /* Even part: same butterfly as pass 1, reading with row stride DCTSIZE */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
+    tmp1 = dataptr[DCTSIZE*1];
+
+    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part: the single odd coefficient comes from the outer difference */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x2 sample block (rows 0..1, columns start_col..start_col+1).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  JSAMPROW elemptr;
+
+  /* Pre-zero all DCTSIZE2 output coefficients; only four entries are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows (sum and difference of the two samples in each row). */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+
+  /* Row 0: tmp0 = sum, tmp1 = difference */
+  elemptr = sample_data[0] + start_col;
+
+  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+  /* Row 1: tmp2 = sum, tmp3 = difference */
+  elemptr = sample_data[1] + start_col;
+
+  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/2)**2 = 2**4 (the << 4 below).
+   */
+
+  /* Column 0: combines the two row sums */
+  /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
+  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
+
+  /* Column 1: combines the two row differences */
+  data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
+  data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
+}
+
+
+/*
+ * Perform the forward DCT on a 1x1 sample block (the sample at row 0, column start_col).
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  /* Pre-zero all DCTSIZE2 output coefficients; only data[0] is written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* We leave the result scaled up by an overall factor of 8. */
+  /* We must also scale the output by (8/1)**2 = 2**6 (the << 6 below). */
+  /* Apply unsigned->signed conversion (subtract CENTERJSAMPLE) before scaling */
+  data[0] = (DCTELEM)
+    ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
+}
+
+
+/*
+ * Perform the forward DCT on a 9x9 sample block (rows 0..8, columns start_col..start_col+8).
+ */
+
+GLOBAL(void)
+jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2;
+  DCTELEM workspace[8];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows; coefficients 0..7 of each 9-sample row are stored. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaptation */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/18). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: sums (tmp0..tmp4) and differences (tmp10..tmp13) of mirrored samples */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
+    tmp4 = GETJSAMPLE(elemptr[4]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
+
+    z1 = tmp0 + tmp2 + tmp3;
+    z2 = tmp1 + tmp4;
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
+	      CONST_BITS-1);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
+    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
+	      + z1 + z2, CONST_BITS-1);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
+	      + z1 - z2, CONST_BITS-1);
+
+    /* Odd part: built from the mirrored differences tmp10..tmp13 */
+
+    dataptr[3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
+	      CONST_BITS-1);
+
+    tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
+    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
+    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
+
+    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
+
+    dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 9)
+	break;			/* Done: all 9 rows processed. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* rows from index DCTSIZE on go to the workspace */
+  }
+
+  /* Pass 2: process columns (row 8 of each column is read from the workspace).
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/9)**2 = 64/81, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part: row 0 pairs with the workspace row, rows 1..3 pair with rows 7..5 */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
+    tmp4 = dataptr[DCTSIZE*4];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
+    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
+
+    z1 = tmp0 + tmp2 + tmp3;
+    z2 = tmp1 + tmp4;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
+	      CONST_BITS+2);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
+    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
+	      + z1 + z2, CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
+	      + z1 - z2, CONST_BITS+2);
+
+    /* Odd part: built from the mirrored differences tmp10..tmp13 */
+
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
+	      CONST_BITS+2);
+
+    tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
+    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
+    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
+
+    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
+
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance workspace pointer to the same column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 10x10 sample block (rows 0..9, columns start_col..start_col+9).
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM workspace[8*2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows; coefficients 0..7 of each 10-sample row are stored. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaptation */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/20). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: sums of mirrored samples; tmp0..tmp4 are reused for the differences */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
+    tmp12 += tmp12;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
+	      CONST_BITS-1);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
+	      CONST_BITS-1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
+	      CONST_BITS-1);
+
+    /* Odd part: built from the mirrored differences tmp0..tmp4 */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
+    tmp2 <<= CONST_BITS;
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
+	      CONST_BITS-1);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
+	    (tmp11 << (CONST_BITS - 1)) - tmp2;
+    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 10)
+	break;			/* Done: all 10 rows processed. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* rows from index DCTSIZE on go to the workspace */
+  }
+
+  /* Pass 2: process columns (rows 8 and 9 of each column are read from the workspace).
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/10)**2 = 16/25, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part: rows 0,1 pair with workspace rows 1,0; rows 2..4 pair with rows 7..5 */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+	      CONST_BITS+2);
+    tmp12 += tmp12;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
+	      CONST_BITS+2);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
+	      CONST_BITS+2);
+
+    /* Odd part: built from the mirrored differences tmp0..tmp4 */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
+	      CONST_BITS+2);
+    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
+	      CONST_BITS+2);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
+	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance workspace pointer to the same column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on an 11x11 sample block (rows 0..10, columns start_col..start_col+10).
+ */
+
+GLOBAL(void)
+jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*3];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows; coefficients 0..7 of each 11-sample row are stored. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaptation */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/22). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: sums (tmp0..tmp5) and differences (tmp10..tmp14) of mirrored samples */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
+    tmp5 = GETJSAMPLE(elemptr[5]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE per sample, DC term only) */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
+    tmp5 += tmp5;
+    tmp0 -= tmp5;
+    tmp1 -= tmp5;
+    tmp2 -= tmp5;
+    tmp3 -= tmp5;
+    tmp4 -= tmp5;
+    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
+	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
+    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
+    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
+	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
+	      CONST_BITS-1);
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
+	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
+	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
+	      CONST_BITS-1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
+	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
+	      CONST_BITS-1);
+
+    /* Odd part: built from the differences tmp10..tmp14; tmp0..tmp5 are reused as scratch */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
+    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
+	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
+    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
+    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
+	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
+    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
+    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
+	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
+    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
+	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 11)
+	break;			/* Done: all 11 rows processed. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* rows from index DCTSIZE on go to the workspace */
+  }
+
+  /* Pass 2: process columns (rows 8..10 of each column are read from the workspace).
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/11)**2 = 64/121, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part: rows 0..2 pair with workspace rows 2..0; rows 3,4 pair with rows 7,6 */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
+    tmp5 = dataptr[DCTSIZE*5];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
+    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
+    tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
+		       FIX(1.057851240)),                /* 128/121 */
+	      CONST_BITS+2);
+    tmp5 += tmp5;
+    tmp0 -= tmp5;
+    tmp1 -= tmp5;
+    tmp2 -= tmp5;
+    tmp3 -= tmp5;
+    tmp4 -= tmp5;
+    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
+	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
+    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
+    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
+	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
+	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
+	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
+	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
+	      CONST_BITS+2);
+
+    /* Odd part: built from the differences tmp10..tmp14; tmp0..tmp5 are reused as scratch */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
+    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
+	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
+    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
+    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
+	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
+    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
+    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
+	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
+    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
+	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance workspace pointer to the same column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 12x12 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM workspace[8*4];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/24). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
+    dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+	      CONST_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+	      CONST_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 12)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/12)**2 = 4/9, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
+	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
+	      CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/* Perform the forward DCT on a 13x13 sample block: 13 rows of sample_data,
+ * 13 samples each from start_col; data gets outputs 0..7 of each pass only.
+ */
+
+GLOBAL(void)
+jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 z1, z2;
+  DCTELEM workspace[8*5];	/* pass-1 rows past the first DCTSIZE (sized for DCTSIZE == 8) */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/26). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+    /* Sum mirrored samples k and 12-k; tmp6 is the unpaired center sample. */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
+    tmp6 = GETJSAMPLE(elemptr[6]);
+    /* Differences of the same mirrored pairs (used by the odd part). */
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
+    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
+    tmp6 += tmp6;	/* twice the center sample, subtracted from every pair sum below */
+    tmp0 -= tmp6;
+    tmp1 -= tmp6;
+    tmp2 -= tmp6;
+    tmp3 -= tmp6;
+    tmp4 -= tmp6;
+    tmp5 -= tmp6;
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
+	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
+	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
+	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
+	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
+	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
+	      CONST_BITS);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
+	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
+	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
+    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
+	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
+	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
+
+    dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
+	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
+    tmp0 = tmp1 + tmp2 + tmp3 -
+	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
+	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
+    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
+	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
+    tmp1 += tmp4 + tmp5 +
+	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
+	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
+    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
+    tmp2 += tmp4 + tmp6 -
+	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
+	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
+    tmp3 += tmp5 + tmp6 +
+	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
+	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 13)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/13)**2 = 64/169, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+    /* Pair row k with row 12-k; rows 8..12 are wsptr rows 0..4 (DCTSIZE == 8). */
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
+    tmp6 = dataptr[DCTSIZE*6];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
+    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
+    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
+    tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
+		       FIX(0.757396450)),          /* 128/169 */
+	      CONST_BITS+1);
+    tmp6 += tmp6;	/* twice the center row value, subtracted from every pair sum below */
+    tmp0 -= tmp6;
+    tmp1 -= tmp6;
+    tmp2 -= tmp6;
+    tmp3 -= tmp6;
+    tmp4 -= tmp6;
+    tmp5 -= tmp6;
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
+	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
+	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
+	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
+	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
+	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
+	      CONST_BITS+1);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
+	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
+	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
+    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
+	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
+	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
+
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
+	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
+    tmp0 = tmp1 + tmp2 + tmp3 -
+	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
+	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
+    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
+	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
+    tmp1 += tmp4 + tmp5 +
+	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
+	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
+    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
+    tmp2 += tmp4 + tmp6 -
+	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
+	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
+    tmp3 += tmp5 + tmp6 +
+	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
+	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/* Perform the forward DCT on a 14x14 sample block: 14 rows of sample_data,
+ * 14 samples each from start_col; data gets outputs 0..7 of each pass only.
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  DCTELEM workspace[8*6];	/* pass-1 rows past the first DCTSIZE (sized for DCTSIZE == 8) */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/28). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+    /* Sum mirrored samples k and 13-k; the 3/10 pair goes straight to tmp13. */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+    /* Differences of the same mirrored pairs (used by the odd part). */
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
+    tmp13 += tmp13;	/* twice the 3/10 pair sum, subtracted from each term below */
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
+	      CONST_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
+	      CONST_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
+	      CONST_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
+    tmp3 <<= CONST_BITS;	/* c7 == 1: bring tmp3 to the scale of the MULTIPLY products */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
+    dataptr[5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
+	      CONST_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
+	      CONST_BITS);
+    dataptr[1] = (DCTELEM) /* tmp6 must end up weighted c13 = (c9-c11) - (c9-c11-c13) */
+      DESCALE(tmp11 + tmp12 + tmp3 - MULTIPLY(tmp0, FIX(1.126980169)) /* c3+c5-c1 */
+	      - MULTIPLY(tmp6, FIX(0.126980169)),         /* c9-c11-c13 */
+	      CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 14)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/14)**2 = 16/49, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+    /* Pair row k with row 13-k; rows 8..13 are wsptr rows 0..5 (DCTSIZE == 8). */
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+1);
+    tmp13 += tmp13;	/* twice the 3/10 row-pair sum, subtracted from each term below */
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
+	      CONST_BITS+1);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
+	      CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+1);
+    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
+	      CONST_BITS+1);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3
+	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
+	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
+	      CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/* Perform the forward DCT on a 15x15 sample block: 15 rows of sample_data,
+ * 15 samples each from start_col; data gets outputs 0..7 of each pass only.
+ */
+
+GLOBAL(void)
+jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*7];	/* pass-1 rows past the first DCTSIZE (sized for DCTSIZE == 8) */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/30). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+    /* Sum mirrored samples k and 14-k; tmp7 is the unpaired center sample. */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
+    tmp7 = GETJSAMPLE(elemptr[7]);
+    /* Differences of the same mirrored pairs (used by the odd part). */
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
+    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
+    tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
+
+    z1 = tmp0 + tmp4 + tmp5;
+    z2 = tmp1 + tmp3 + tmp6;
+    z3 = tmp2 + tmp7;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
+    z3 += z3;	/* twice (tmp2 + tmp7), subtracted from z1 and z2 below */
+    dataptr[6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
+	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
+	      CONST_BITS);
+    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;	/* common term reused by z1 and z2 */
+    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
+         MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
+    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
+	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
+    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
+	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
+	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
+
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
+    dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
+
+    /* Odd part */
+
+    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+		    FIX(1.224744871));                         /* c5 */
+    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
+	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
+    tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
+    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
+	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
+	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
+    tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
+	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
+	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
+    tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
+	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
+	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 15)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/15)**2 = 64/225, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+    /* Pair row k with row 14-k; rows 8..14 are wsptr rows 0..6 (DCTSIZE == 8). */
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
+    tmp7 = dataptr[DCTSIZE*7];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
+    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
+    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
+    tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
+    tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
+
+    z1 = tmp0 + tmp4 + tmp5;
+    z2 = tmp1 + tmp3 + tmp6;
+    z3 = tmp2 + tmp7;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
+	      CONST_BITS+2);
+    z3 += z3;	/* twice (tmp2 + tmp7), subtracted from z1 and z2 below */
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
+	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
+	      CONST_BITS+2);
+    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;	/* common term reused by z1 and z2 */
+    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
+         MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
+    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
+	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
+    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
+	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
+	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
+
+    /* Odd part */
+
+    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+		    FIX(1.393487498));                         /* c5 */
+    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
+	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
+    tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
+    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
+	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
+	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
+    tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
+	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
+	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
+    tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
+	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
+	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 16x16 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  DCTELEM workspace[DCTSIZE2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/32). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == DCTSIZE * 2)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/16)**2 = 1/2**2.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS+PASS1_BITS+2);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
+	      CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS+PASS1_BITS+2);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 16x8 sample block: reads 16 samples from each
+ * of DCTSIZE rows of sample_data at start_col, writes coefficients to data.
+ * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows, keeping outputs 0..7 of each 16-point transform. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
+
+  dataptr = data;
+  ctr = 0;			/* redundant: the for statement below resets ctr */
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part (tmp0..tmp7 are reused below for the pair differences) */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE off each of 16 samples) */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part (operates on the pair differences held in tmp0..tmp7) */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by 8/16 = 1/2 (the +1 in each descale).
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+					   CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
+					   CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 14x7 sample block: reads 14 samples from each
+ * of 7 rows of sample_data at start_col, writes coefficients to data.
+ * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero bottom row of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
+
+  /* Pass 1: process rows, keeping outputs 0..7 of each 14-point transform. */
+  /* Results are scaled up by sqrt(8) compared to a true DCT and by 2**PASS1_BITS. */
+  /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
+  /* c7 = 1: unit-weight terms are shifted left by CONST_BITS, not multiplied. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part (tmp0..tmp6 are reused below for the pair differences) */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE off each of 14 samples) */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp13 += tmp13;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part (operates on the pair differences held in tmp0..tmp6) */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
+    tmp3 <<= CONST_BITS;				  /* c7 = 1 */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
+    dataptr[5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
+	      CONST_BITS-PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM)	/* tmp6 gets c13 = c9 - c11 + c7 - (c3+c5-c1) */
+      DESCALE(tmp11 + tmp12 + tmp3 + (tmp6 << CONST_BITS) -
+	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
+	      CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/14)*(8/7) = 32/49, which we
+   * partially fold into the constant multipliers and final shifting:
+   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+    z1 = tmp0 + tmp2;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+	      CONST_BITS+PASS1_BITS+1);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 12x6 sample block: reads 12 samples from each
+ * of 6 rows of sample_data at start_col, writes coefficients to data.
+ * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 2 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
+
+  /* Pass 1: process rows, keeping outputs 0..7 of each 12-point transform. */
+  /* Results are scaled up by sqrt(8) compared to a true DCT and by 2**PASS1_BITS. */
+  /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
+  /* c6 = 1 and c10 = c2-1: unit-weight terms are shifted left by CONST_BITS. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part (tmp0..tmp5 are reused below for the pair differences) */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+    dataptr[0] = (DCTELEM)	/* apply unsigned->signed conversion */
+      ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(((tmp14 - tmp15) << CONST_BITS) +          /* c6, c10-c2 */
+	      MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part (operates on the pair differences held in tmp0..tmp5) */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/12)*(8/6) = 8/9, which we
+   * partially fold into the constant multipliers and final shifting:
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 10x5 sample block: reads 10 samples from each
+ * of 5 rows of sample_data at start_col, writes coefficients to data.
+ * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 3 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
+
+  /* Pass 1: process rows, keeping outputs 0..7 of each 10-point transform. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part (tmp0..tmp4 are reused below for the pair differences) */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+    /* Apply unsigned->signed conversion (CENTERJSAMPLE off each of 10 samples) */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp12 += tmp12;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
+	      CONST_BITS-PASS1_BITS);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part (operates on the pair differences held in tmp0..tmp4) */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
+    tmp2 <<= CONST_BITS;	/* c5 = 1: match the scale of MULTIPLY products */
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
+	      CONST_BITS-PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
+	    (tmp11 << (CONST_BITS - 1)) - tmp2;
+    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/10)*(8/5) = 32/25, which we
+   * fold into the constant multipliers:
+   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2];
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on an 8x4 sample block: reads 8 samples from each
+ * of 4 rows of sample_data at start_col, writes coefficients to data.
+ * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 4 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by 8/4 = 2, which we add here. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion; the extra +1 shift is the factor 2 */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    /* Add fudge factor here for final descale (half of the shifted-out range). */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				       CONST_BITS-PASS1_BITS-1);
+    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				       CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale; it reaches all four outputs. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
+    dataptr[5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
+    dataptr[7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16) (8-point constants).
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);   /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Forward DCT for a 6x3 sample block (6 samples across, 3 rows down).
+ *
+ * Pass 1 runs a 6-point FDCT on each row, pass 2 a 3-point FDCT on each
+ * column.  The samples are taken from rows 0..2 of sample_data, beginning
+ * at column start_col; coefficients are stored into data.
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 ev0, ev1, ev2;		/* sums of mirrored inputs */
+  INT32 od0, od1, od2;		/* differences of mirrored inputs */
+  INT32 esum, twist;
+  DCTELEM *out;
+  JSAMPROW in;
+  int idx;
+  SHIFT_TEMPS
+
+  /* Clear the whole coefficient block; only a 6x3 corner is written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: one 6-point FDCT per input row.
+   * Results are scaled up by sqrt(8) compared to a true DCT and by
+   * 2**PASS1_BITS; one more factor of 2 is applied here as part of the
+   * output adaptation scaling for the different DCT size.
+   * cK stands for sqrt(2) * cos(K*pi/12).
+   */
+
+  for (idx = 0; idx < 3; idx++) {
+    in = sample_data[idx] + start_col;
+    out = data + idx * DCTSIZE;
+
+    /* Fold the row about its centre. */
+
+    ev0 = GETJSAMPLE(in[0]) + GETJSAMPLE(in[5]);
+    od0 = GETJSAMPLE(in[0]) - GETJSAMPLE(in[5]);
+    ev1 = GETJSAMPLE(in[1]) + GETJSAMPLE(in[4]);
+    od1 = GETJSAMPLE(in[1]) - GETJSAMPLE(in[4]);
+    ev2 = GETJSAMPLE(in[2]) + GETJSAMPLE(in[3]);
+    od2 = GETJSAMPLE(in[2]) - GETJSAMPLE(in[3]);
+
+    /* Even part; the DC term also gets the unsigned->signed conversion. */
+
+    esum = ev0 + ev2;
+
+    out[0] = (DCTELEM)
+      ((esum + ev1 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    out[2] = (DCTELEM)
+      DESCALE(MULTIPLY(ev0 - ev2, FIX(1.224744871)),            /* c2 */
+	      CONST_BITS-PASS1_BITS-1);
+    out[4] = (DCTELEM)
+      DESCALE(MULTIPLY(esum - ev1 - ev1, FIX(0.707106781)),     /* c4 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part */
+
+    twist = DESCALE(MULTIPLY(od0 + od2, FIX(0.366025404)),      /* c5 */
+		    CONST_BITS-PASS1_BITS-1);
+
+    out[1] = (DCTELEM) (twist + ((od0 + od1) << (PASS1_BITS+1)));
+    out[3] = (DCTELEM) ((od0 - od1 - od2) << (PASS1_BITS+1));
+    out[5] = (DCTELEM) (twist + ((od2 - od1) << (PASS1_BITS+1)));
+  }
+
+  /* Pass 2: one 3-point FDCT per column of the pass 1 output.
+   * The PASS1_BITS scaling is removed here, while the usual overall
+   * factor of 8 is kept.  The output must also be scaled by
+   * (8/6)*(8/3) = 32/9; a factor of 2 was applied in pass 1 and the
+   * remaining 16/9 is folded into the constant multipliers:
+   * cK stands for sqrt(2) * cos(K*pi/6) * 16/9.
+   */
+
+  for (idx = 0; idx < 6; idx++) {
+    out = data + idx;
+
+    /* Read the whole column before overwriting any of it. */
+
+    ev0 = out[DCTSIZE*0] + out[DCTSIZE*2];
+    od0 = out[DCTSIZE*0] - out[DCTSIZE*2];
+    ev1 = out[DCTSIZE*1];
+
+    /* Even part */
+
+    out[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(ev0 + ev1, FIX(1.777777778)),            /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    out[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(ev0 - ev1 - ev1, FIX(1.257078722)),      /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    out[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(od0, FIX(2.177324216)),                  /* c1 */
+	      CONST_BITS+PASS1_BITS);
+  }
+}
+
+
+/*
+ * Forward DCT for a 4x2 sample block (4 samples across, 2 rows down).
+ *
+ * Pass 1 runs a 4-point FDCT on each row, pass 2 a 2-point FDCT on each
+ * column.  The samples are taken from rows 0..1 of sample_data, beginning
+ * at column start_col; coefficients are stored into data.
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 outer_sum, inner_sum;	/* in[0]+in[3] and in[1]+in[2] */
+  INT32 outer_dif, inner_dif;	/* in[0]-in[3] and in[1]-in[2] */
+  INT32 acc, upper, lower;
+  DCTELEM *out;
+  JSAMPROW in;
+  int idx;
+  SHIFT_TEMPS
+
+  /* Clear the whole coefficient block; only a 4x2 corner is written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: one 4-point FDCT per input row.
+   * Results are scaled up by sqrt(8) compared to a true DCT and by
+   * 2**PASS1_BITS.  The output scaling of (8/4)*(8/2) = 2**3 is applied
+   * here as well, which is why the shift counts below carry an extra 3.
+   * cK stands for sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
+
+  for (idx = 0; idx < 2; idx++) {
+    in = sample_data[idx] + start_col;
+    out = data + idx * DCTSIZE;
+
+    /* Fold the row about its centre. */
+
+    outer_sum = GETJSAMPLE(in[0]) + GETJSAMPLE(in[3]);
+    outer_dif = GETJSAMPLE(in[0]) - GETJSAMPLE(in[3]);
+    inner_sum = GETJSAMPLE(in[1]) + GETJSAMPLE(in[2]);
+    inner_dif = GETJSAMPLE(in[1]) - GETJSAMPLE(in[2]);
+
+    /* Even part; the DC term also gets the unsigned->signed conversion. */
+
+    out[0] = (DCTELEM)
+      ((outer_sum + inner_sum - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
+    out[2] = (DCTELEM) ((outer_sum - inner_sum) << (PASS1_BITS+3));
+
+    /* Odd part; the rounding term for the final descale is added once
+     * to the shared product so that both outputs pick it up.
+     */
+
+    acc = MULTIPLY(outer_dif + inner_dif, FIX_0_541196100);       /* c6 */
+    acc += ONE << (CONST_BITS-PASS1_BITS-4);
+
+    out[1] = (DCTELEM)
+      RIGHT_SHIFT(acc + MULTIPLY(outer_dif, FIX_0_765366865),     /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-3);
+    out[3] = (DCTELEM)
+      RIGHT_SHIFT(acc - MULTIPLY(inner_dif, FIX_1_847759065),     /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-3);
+  }
+
+  /* Pass 2: one 2-point FDCT per column of the pass 1 output, i.e. a
+   * sum and a difference of the two row results.
+   * The PASS1_BITS scaling is removed here, while the usual overall
+   * factor of 8 is kept.
+   */
+
+  for (idx = 0; idx < 4; idx++) {
+    out = data + idx;
+
+    /* Bias the upper element with the rounding term for the descale. */
+
+    upper = out[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
+    lower = out[DCTSIZE*1];
+
+    /* Even part is the sum, odd part the difference. */
+
+    out[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(upper + lower, PASS1_BITS);
+    out[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(upper - lower, PASS1_BITS);
+  }
+}
+
+
+/*
+ * Forward DCT for a 2x1 sample block (2 samples across, a single row).
+ *
+ * A 2-point FDCT on the row; no separate column pass is needed for one row.
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 left, right, total, delta;
+  JSAMPROW in;
+
+  /* Clear the whole coefficient block; only two entries are set below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  in = sample_data[0] + start_col;
+  left = GETJSAMPLE(in[0]);
+  right = GETJSAMPLE(in[1]);
+
+  /* Results stay scaled up by the usual overall factor of 8.  The
+   * output must also be scaled by (8/2)*(8/1) = 2**5, which is the
+   * left shift by 5 applied to both coefficients.
+   */
+  /* Even part: sum of the samples, with the unsigned->signed conversion. */
+  total = left + right - 2 * CENTERJSAMPLE;
+  data[0] = (DCTELEM) (total << 5);
+
+  /* Odd part: difference of the samples. */
+  delta = left - right;
+  data[1] = (DCTELEM) (delta << 5);
+}
+
+
+/*
+ * Perform the forward DCT on an 8x16 sample block: 16 rows of sample_data,
+ * 8 samples each beginning at column start_col; results are stored in data.
+ * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  INT32 z1;
+  DCTELEM workspace[DCTSIZE2];	/* pass 1 results for rows DCTSIZE..2*DCTSIZE-1 */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* The first DCTSIZE rows go to data, the following ones to workspace. */
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				   CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				   CONST_BITS-PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+    /* After DCTSIZE rows data is full: continue the output in workspace. */
+    if (ctr != DCTSIZE) {
+      if (ctr == DCTSIZE * 2)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by 8/16 = 1/2 (the +1 in the shifts below).
+   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part: input k is paired with input 15-k (8..15 are in workspace) */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+    /* The differences of the same pairs are the inputs of the odd part. */
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS+PASS1_BITS+1);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 7x14 sample block: 14 rows of sample_data,
+ * 7 samples each beginning at column start_col; results are stored in data.
+ * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*6];	/* pass 1 results for input rows 8..13 */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block (only columns 0..6 are written below). */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
+  /* The first DCTSIZE rows go to data, the following ones to workspace. */
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+    tmp3 = GETJSAMPLE(elemptr[3]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+    z1 = tmp0 + tmp2;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+    /* After DCTSIZE rows data is full: continue the output in workspace. */
+    if (ctr != DCTSIZE) {
+      if (ctr == 14)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/7)*(8/14) = 32/49, which we
+   * fold into the constant multipliers:
+   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    /* Even part: input k is paired with input 13-k (8..13 are in workspace) */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+    /* The differences of the same pairs are the inputs of the odd part. */
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp13 += tmp13;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
+	      CONST_BITS+PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3
+	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
+	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x12 sample block: 12 rows of sample_data,
+ * 6 samples each beginning at column start_col; results are stored in data.
+ * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM workspace[8*4];	/* pass 1 results for input rows 8..11 */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block (only columns 0..5 are written below). */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+  /* The first DCTSIZE rows go to data, the following ones to workspace. */
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
+		    CONST_BITS-PASS1_BITS);
+
+    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+    ctr++;
+    /* After DCTSIZE rows data is full: continue the output in workspace. */
+    if (ctr != DCTSIZE) {
+      if (ctr == 12)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)*(8/12) = 8/9, which we
+   * fold into the constant multipliers:
+   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    /* Even part: input k is paired with input 11-k (8..11 are in workspace) */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+    /* The differences of the same pairs are the inputs of the odd part. */
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
+	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x10 sample block: 10 rows of sample_data,
+ * 5 samples each beginning at column start_col; results are stored in data.
+ * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM workspace[8*2];	/* pass 1 results for input rows 8..9 */
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block (only columns 0..4 are written below). */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
+  /* The first DCTSIZE rows go to data, the following ones to workspace. */
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+    tmp2 = GETJSAMPLE(elemptr[2]);
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
+    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+	      CONST_BITS-PASS1_BITS);
+
+    ctr++;
+    /* After DCTSIZE rows data is full: continue the output in workspace. */
+    if (ctr != DCTSIZE) {
+      if (ctr == 10)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/5)*(8/10) = 32/25, which we
+   * fold into the constant multipliers:
+   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    /* Even part: input k is paired with input 9-k (8 and 9 are in workspace) */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+    /* The differences of the same pairs are the inputs of the odd part. */
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 += tmp12;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
+	      CONST_BITS+PASS1_BITS);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
+	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x8 sample block.
+ * Reads 4 samples starting at start_col from each of DCTSIZE rows of sample_data.
+ * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block; only 4 of its columns are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by 8/4 = 2, which we add here (the +1/-1 in the shift counts). */
+  /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part: pairwise sums in tmp0/tmp1; the differences tmp10/tmp11 feed the odd part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion (4 samples summed; the offset cancels in every other output) */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor (half the final divisor) here for final descale; reaches outputs 1 and 3. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.  Only the 4 columns written in pass 1 are processed.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+    /* Add fudge factor here for final descale (reaches outputs 0 and 4 via tmp10). */
+    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    /* Add fudge factor here for final descale (reaches outputs 2 and 6 via z1). */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale (reaches all four odd outputs via tmp12/tmp13). */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 3x6 sample block.
+ * Reads 3 samples starting at start_col from each of the first 6 rows of sample_data.
+ * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11, tmp12;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block; only 3 columns of 6 rows are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2 as part of output adaptation */
+  /* scaling for different DCT size (the +1/-1 in the shift counts). */
+  /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part (tmp2, the difference of the two end samples, feeds the odd part) */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion (3 samples summed; the offset cancels in the other outputs) */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.  Only the 3 columns written in pass 1 are processed.
+   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
+   * fold into the constant multipliers (other part was done in pass 1):
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part: the c5 product held in tmp10 is shared by outputs 1 and 5 */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x4 sample block.
+ * Reads 2 samples starting at start_col from each of the first 4 rows of sample_data.
+ * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block; only 2 columns of 4 rows are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; PASS1_BITS is not used here. */
+  /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here (the << 3 below). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    /* Apply unsigned->signed conversion (2 samples summed; the offset cancels in the difference) */
+    dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
+
+    /* Odd part */
+
+    dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * 4-point FDCT kernel (applied to the 2 columns written in pass 1),
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 2; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
+    dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor (half the final divisor) here for final descale; reaches outputs 1 and 3. */
+    tmp0 += ONE << (CONST_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 1x2 sample block.
+ * Reads one sample at start_col from each of the first 2 rows of sample_data.
+ * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+
+  /* Pre-zero output coefficient block; only data[DCTSIZE*0] and data[DCTSIZE*1] are written below. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
+  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
+
+  /* We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/1)*(8/2) = 2**5 (the << 5 below).
+   */
+
+  /* Even part */
+  /* Apply unsigned->signed conversion (2 samples summed; the offset cancels in the difference) */
+  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+  /* Odd part; NOTE(review): the shifted value may be negative here and above -- confirm OK */
+  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+}
+
+#endif /* DCT_SCALING_SUPPORTED */
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/jpeg/jidctflt.c b/jpeg/jidctflt.c
index 0188ce3df..23ae9d333 100644
--- a/jpeg/jidctflt.c
+++ b/jpeg/jidctflt.c
@@ -2,6 +2,7 @@
  * jidctflt.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -76,10 +77,9 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   FLOAT_MULT_TYPE * quantptr;
   FAST_FLOAT * wsptr;
   JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
@@ -152,12 +152,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;	/* phase 2 */
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;
 
     wsptr[DCTSIZE*0] = tmp0 + tmp7;
     wsptr[DCTSIZE*7] = tmp0 - tmp7;
@@ -165,8 +165,8 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     wsptr[DCTSIZE*6] = tmp1 - tmp6;
     wsptr[DCTSIZE*2] = tmp2 + tmp5;
     wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*4] = tmp3 + tmp4;
-    wsptr[DCTSIZE*3] = tmp3 - tmp4;
+    wsptr[DCTSIZE*3] = tmp3 + tmp4;
+    wsptr[DCTSIZE*4] = tmp3 - tmp4;
 
     inptr++;			/* advance pointers to next column */
     quantptr++;
@@ -174,7 +174,6 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   }
   
   /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3. */
 
   wsptr = workspace;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
@@ -187,8 +186,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     
     /* Even part */
 
-    tmp10 = wsptr[0] + wsptr[4];
-    tmp11 = wsptr[0] - wsptr[4];
+    /* Apply signed->unsigned and prepare float->int conversion */
+    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    tmp10 = z5 + wsptr[4];
+    tmp11 = z5 - wsptr[4];
 
     tmp13 = wsptr[2] + wsptr[6];
     tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
@@ -209,31 +210,23 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-    outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
-			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
-			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
-			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
-			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
-			    & RANGE_MASK];
+    tmp4 = tmp10 - tmp5;
+
+    /* Final output stage: float->int conversion and range-limit */
+
+    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
     
     wsptr += DCTSIZE;		/* advance pointer to next row */
   }
diff --git a/jpeg/jidctint.c b/jpeg/jidctint.c
index a72b3207c..dcdf7ce45 100644
--- a/jpeg/jidctint.c
+++ b/jpeg/jidctint.c
@@ -2,6 +2,7 @@
  * jidctint.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modification developed 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -23,6 +24,28 @@
  * The advantage of this method is that no data path contains more than one
  * multiplication; this allows a very simple and accurate implementation in
  * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide IDCT routines with various output sample block sizes for
+ * direct resolution reduction or enlargement and for directly resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
+ *
+ * For N<8 we simply take the corresponding low-frequency coefficients of
+ * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
+ * to yield the downscaled outputs.
+ * This can be seen as direct low-pass downsampling from the DCT domain
+ * point of view rather than the usual spatial domain point of view,
+ * yielding significant computational savings and results at least
+ * as good as common bilinear (averaging) spatial downsampling.
+ *
+ * For N>8 we apply a partial NxN IDCT, taking the 8 input coefficients as the
+ * lower frequencies and assuming the higher frequencies to be zero.
+ * It turns out that the computational effort is similar to the 8x8 IDCT
+ * regarding the output size.
+ * Furthermore, the scaling and descaling is the same for all IDCT sizes.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
  */
 
 #define JPEG_INTERNALS
@@ -38,7 +61,7 @@
  */
 
 #if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
 #endif
 
 
@@ -151,7 +174,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 {
   INT32 tmp0, tmp1, tmp2, tmp3;
   INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  INT32 z1, z2, z3;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE * quantptr;
   int * wsptr;
@@ -177,14 +200,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
      * With typical images and quantization tables, half or more of the
      * column DCT calculations can be simplified this way.
      */
-    
+
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 	inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
       wsptr[DCTSIZE*2] = dcval;
@@ -193,82 +216,84 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       wsptr[DCTSIZE*5] = dcval;
       wsptr[DCTSIZE*6] = dcval;
       wsptr[DCTSIZE*7] = dcval;
-      
+
       inptr++;			/* advance pointers to next column */
       quantptr++;
       wsptr++;
       continue;
     }
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
     
     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-    
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
 
-    tmp0 = (z2 + z3) << CONST_BITS;
-    tmp1 = (z2 - z3) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
+
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
     
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
     
     inptr++;			/* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
-  
+
   /* Pass 2: process rows from work array, store into output array. */
   /* Note that we must descale the results by a factor of 8 == 2**3, */
   /* and also undo the PASS1_BITS scaling. */
@@ -283,14 +308,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
      * test takes more time than it's worth.  In that case this section
      * may be commented out.
      */
-    
+
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
 				  & RANGE_MASK];
-      
+
       outptr[0] = dcval;
       outptr[1] = dcval;
       outptr[2] = dcval;
@@ -304,86 +329,4809 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       continue;
     }
 #endif
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
     
     z2 = (INT32) wsptr[2];
     z3 = (INT32) wsptr[6];
-    
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
-    tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
-    tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+    /* Add fudge factor here for final descale. */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
     
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
+
     tmp0 = (INT32) wsptr[7];
     tmp1 = (INT32) wsptr[5];
     tmp2 = (INT32) wsptr[3];
     tmp3 = (INT32) wsptr[1];
-    
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    
+
     wsptr += DCTSIZE;		/* advance pointer to next row */
   }
 }
 
+#ifdef IDCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x7 output block.  Only coefficients coef_block[DCTSIZE*r + c]
+ * with r, c in 0..6 are read; the rest of the block is ignored.
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/14).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[7*7];	/* buffers data between passes: 7 rows of 7 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp13 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
+    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp0 = z1 + z3;
+    z2 -= tmp0;
+    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
+    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
+    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp1 += tmp2;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
+    tmp0 += z2;
+    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
+
+    /* Final output stage: rows k and 6-k are even part plus/minus odd part */
+
+    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS); /* middle row: even part only */
+  }
+
+  /* Pass 2: process 7 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
+    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp0 = z1 + z3;
+    z2 -= tmp0;
+    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
+    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
+    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+
+    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp1 += tmp2;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
+    tmp0 += z2;
+    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 7;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x6 output block.  Only coefficients
+ * coef_block[DCTSIZE*r + c] with r, c in 0..5 are read.
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/12).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[6*6];	/* buffers data between passes: 6 rows of 6 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); /* descaled here, not at output */
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << PASS1_BITS; /* same scale as tmp11: no shift needed below */
+
+    /* Final output stage: rows 1 and 4 combine the already-descaled tmp11 and tmp1 */
+
+    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*1] = (int) (tmp11 + tmp1);
+    wsptr[6*4] = (int) (tmp11 - tmp1);
+    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[4];
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = tmp0 - tmp10 - tmp10; /* unlike pass 1, left unshifted until the output stage */
+    tmp10 = (INT32) wsptr[2];
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << CONST_BITS; /* pass 2 shifts by CONST_BITS, matching tmp11 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 5x5 output block.  Only coefficients
+ * coef_block[DCTSIZE*r + c] with r, c in 0..4 are read.
+ * Optimized algorithm with 5 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/10).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[5*5];	/* buffers data between passes: 5 rows of 5 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2; /* middle row: DC term minus 4*z2 */
+
+    /* Odd part */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
+    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
+    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
+
+    /* Final output stage: rows k and 4-k are even part plus/minus odd part */
+
+    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS); /* even part only */
+  }
+
+  /* Pass 2: process 5 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 <<= CONST_BITS;
+    tmp0 = (INT32) wsptr[2];
+    tmp1 = (INT32) wsptr[4];
+    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2; /* middle column: DC term minus 4*z2 */
+
+    /* Odd part */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
+    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
+    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 5;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 4x4 output block.  Only coefficients
+ * coef_block[DCTSIZE*r + c] with r, c in 0..3 are read.
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+ */
+
+GLOBAL(void)
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[4*4];	/* buffers data between passes: 4 rows of 4 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: sums and differences only, so scale by PASS1_BITS directly */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    
+    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
+    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
+    /* Add fudge factor here for final descale; z1 carries it into both shifts below. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+		       CONST_BITS-PASS1_BITS);
+    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+		       CONST_BITS-PASS1_BITS);
+
+    /* Final output stage: both parts are already at PASS1_BITS scale, so no shift here */
+
+    wsptr[4*0] = (int) (tmp10 + tmp0);
+    wsptr[4*3] = (int) (tmp10 - tmp0);
+    wsptr[4*1] = (int) (tmp12 + tmp2);
+    wsptr[4*2] = (int) (tmp12 - tmp2);
+  }
+
+  /* Pass 2: process 4 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp2 = (INT32) wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 4;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x3 output block.  Only coefficients
+ * coef_block[DCTSIZE*r + c] with r, c in 0..2 are read.
+ * Optimized algorithm with 2 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/6).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[3*3];	/* buffers data between passes: 3 rows of 3 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12; /* middle row: DC term minus twice the c2 product */
+
+    /* Odd part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage: rows 0 and 2 are even part plus/minus odd part */
+
+    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS); /* even part only */
+  }
+
+  /* Pass 2: process 3 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[2];
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12; /* middle column: DC term minus twice the c2 product */
+
+    /* Odd part */
+
+    tmp12 = (INT32) wsptr[1];
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 3;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Dequantize and inverse-transform one coefficient block into a
+ * reduced-size 2x2 output block.  Only the four coefficients in the
+ * first two rows and first two columns of coef_block are used.
+ *
+ * Multiplication-less algorithm (apart from dequantization).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 sum0, sum1, diff0, diff1, upper, lower;
+  ISLOW_MULT_TYPE * qtbl;
+  JSAMPROW row;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* Column pass: a 2-point butterfly on each of the first two columns. */
+
+  qtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+  /* First column */
+  upper = DEQUANTIZE(coef_block[0], qtbl[0]);
+  lower = DEQUANTIZE(coef_block[DCTSIZE], qtbl[DCTSIZE]);
+  /* Rounding bias: half of the final divisor (1 << 3). */
+  upper += ONE << 2;
+
+  sum0 = upper + lower;
+  diff0 = upper - lower;
+
+  /* Second column */
+  upper = DEQUANTIZE(coef_block[1], qtbl[1]);
+  lower = DEQUANTIZE(coef_block[DCTSIZE+1], qtbl[DCTSIZE+1]);
+
+  sum1 = upper + lower;
+  diff1 = upper - lower;
+
+  /* Row pass: a 2-point butterfly across the column results, then descale. */
+
+  /* Upper output row */
+  row = output_buf[0] + output_col;
+
+  row[0] = range_limit[(int) RIGHT_SHIFT(sum0 + sum1, 3) & RANGE_MASK];
+  row[1] = range_limit[(int) RIGHT_SHIFT(sum0 - sum1, 3) & RANGE_MASK];
+
+  /* Lower output row */
+  row = output_buf[1] + output_col;
+
+  row[0] = range_limit[(int) RIGHT_SHIFT(diff0 + diff1, 3) & RANGE_MASK];
+  row[1] = range_limit[(int) RIGHT_SHIFT(diff0 - diff1, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Dequantize and inverse-transform one coefficient block into a
+ * reduced-size 1x1 output block.
+ *
+ * No real inverse DCT is needed: the single output sample is the
+ * average pixel value, which is one-eighth of the DC coefficient.
+ */
+
+GLOBAL(void)
+jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  ISLOW_MULT_TYPE * dc_quant = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  int dc;
+  SHIFT_TEMPS
+
+  /* Dequantize the DC term; it is the only coefficient used here. */
+  dc = DEQUANTIZE(*coef_block, *dc_quant);
+  /* Rounded division by 8 turns it into the block's average sample value. */
+  dc = (int) DESCALE((INT32) dc, 3);
+
+  (*output_buf)[output_col] = range_limit[dc & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 9x9 output block.  All 8 columns of coef_block are read
+ * (rows DCTSIZE*0..7 of each); each column and each row yields 9 values.
+ * Optimized algorithm with 10 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/18).
+ */
+
+GLOBAL(void)
+jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*9];	/* buffers data between passes: 9 rows of 8 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
+    tmp1 = tmp0 + tmp3;
+    tmp2 = tmp0 - tmp3 - tmp3;
+
+    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+    tmp11 = tmp2 + tmp0;
+    tmp14 = tmp2 - tmp0 - tmp0;
+
+    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
+    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
+
+    tmp10 = tmp1 + tmp0 - tmp3;
+    tmp12 = tmp1 - tmp0 + tmp2;
+    tmp13 = tmp1 - tmp2 + tmp3;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+
+    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
+    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
+    tmp0 = tmp2 + tmp3 - z2;
+    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
+    tmp2 += z2 - tmp1;
+    tmp3 += z2 + tmp1;
+    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+    /* Final output stage: rows k and 8-k are even part plus/minus odd part */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS); /* middle row: even part only */
+  }
+
+  /* Pass 2: process 9 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 9; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
+    tmp1 = tmp0 + tmp3;
+    tmp2 = tmp0 - tmp3 - tmp3;
+
+    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+    tmp11 = tmp2 + tmp0;
+    tmp14 = tmp2 - tmp0 - tmp0;
+
+    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
+    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
+
+    tmp10 = tmp1 + tmp0 - tmp3;
+    tmp12 = tmp1 - tmp0 + tmp2;
+    tmp13 = tmp1 - tmp2 + tmp3;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+
+    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
+    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
+    tmp0 = tmp2 + tmp3 - z2;
+    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
+    tmp2 += z2 - tmp1;
+    tmp3 += z2 + tmp1;
+    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x10 output block.  All 8 columns of coef_block are read
+ * (rows DCTSIZE*0..7 of each); each column and each row yields 10 values.
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/20).
+ */
+
+GLOBAL(void)
+jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4, z5;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*10];	/* buffers data between passes: 10 rows of 8 ints */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array (kept scaled up by PASS1_BITS). */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: half the divisor, so the shift rounds. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+    z5 = z3 << CONST_BITS;
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z5 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS; /* same scale as tmp22: no shift needed below */
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage: rows 2 and 7 combine the already-descaled tmp22 and tmp12 */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) (tmp22 + tmp12);
+    wsptr[8*7] = (int) (tmp22 - tmp12);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 10 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 10; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the final divisor once shifted). */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z3 <<= CONST_BITS; /* pre-shifted once; plays the role of z5 in pass 1 */
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z3 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3; /* z3 already carries the CONST_BITS shift */
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage: descale, then map through range_limit[] (index masked by RANGE_MASK) */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing an 11x11 output block.
+ *
+ * Optimized algorithm with 24 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/22).
+ */
+
+GLOBAL(void)
+jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*11];	/* pass 1 results: 11 rows of 8 values, read back row by row in pass 2 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns; each 1-D transform stores 11 values down one workspace column. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: terms built from the dequantized coefficients at DCTSIZE*0, *2, *4, *6 */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: ONE placed one bit below the CONST_BITS-PASS1_BITS shift of the output stage. */
+    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
+    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
+    z4 = z1 + z3;
+    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    z4 -= z2;
+    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
+    tmp21 = tmp20 + tmp23 + tmp25 -
+	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+    tmp24 += tmp25;
+    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
+    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
+	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
+
+    /* Odd part: terms built from the dequantized coefficients at DCTSIZE*1, *3, *5, *7 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z2;
+    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
+    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
+    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
+    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    tmp11 += z1;
+    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
+    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+
+    /* Final output stage: even+odd sums fill rows 0..4, even-odd differences fill mirrored rows 10..6, row 5 is even-only. */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 11 rows from work array; each 8-value row yields 11 samples stored at output_buf[ctr] + output_col. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 11; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: same kernel as pass 1, fed from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor here for final descale: after the CONST_BITS shift it sits one bit below the CONST_BITS+PASS1_BITS+3 output shift. */
+    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
+    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
+    z4 = z1 + z3;
+    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    z4 -= z2;
+    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
+    tmp21 = tmp20 + tmp23 + tmp25 -
+	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+    tmp24 += tmp25;
+    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
+    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
+	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
+
+    /* Odd part: same kernel as pass 1, fed from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z2;
+    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
+    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
+    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
+    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    tmp11 += z1;
+    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
+    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+
+    /* Final output stage: descale each sum/difference, mask with RANGE_MASK, and map through range_limit to get the sample. */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row of 8 workspace values */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x12 output block.
+ *
+ * Optimized algorithm with 15 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/24).
+ */
+
+GLOBAL(void)
+jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*12];	/* pass 1 results: 12 rows of 8 values, read back row by row in pass 2 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns; each 1-D transform stores 12 values down one workspace column. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: terms built from the dequantized coefficients at DCTSIZE*0, *2, *4, *6 */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: ONE placed one bit below the CONST_BITS-PASS1_BITS shift of the output stage. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part: terms built from the dequantized coefficients at DCTSIZE*1, *3, *5, *7 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage: even+odd sums fill rows 0..5, even-odd differences fill the mirrored rows 11..6. */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 12 rows from work array; each 8-value row yields 12 samples stored at output_buf[ctr] + output_col. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 12; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: same kernel as pass 1, fed from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor here for final descale: after the CONST_BITS shift it sits one bit below the CONST_BITS+PASS1_BITS+3 output shift. */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+
+    z4 = (INT32) wsptr[4];
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = (INT32) wsptr[2];
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = (INT32) wsptr[6];
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part: same kernel as pass 1, fed from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage: descale each sum/difference, mask with RANGE_MASK, and map through range_limit to get the sample. */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row of 8 workspace values */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 13x13 output block.
+ *
+ * Optimized algorithm with 29 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/26).
+ */
+
+GLOBAL(void)
+jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*13];	/* pass 1 results: 13 rows of 8 values, read back row by row in pass 2 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns; each 1-D transform stores 13 values down one workspace column. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: terms built from the dequantized coefficients at DCTSIZE*0, *2, *4, *6 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale: ONE placed one bit below the CONST_BITS-PASS1_BITS shift of the output stage. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
+
+    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
+    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
+
+    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
+    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
+
+    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
+
+    /* Odd part: terms built from the dequantized coefficients at DCTSIZE*1, *3, *5, *7 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
+    tmp15 = z1 + z4;
+    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp11 += tmp14;
+    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp12 += tmp14;
+    tmp13 += tmp14;
+    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
+    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
+    tmp14 += z1;
+    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
+	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+
+    /* Final output stage: even+odd sums fill rows 0..5, even-odd differences fill mirrored rows 12..7, row 6 is even-only. */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 13 rows from work array; each 8-value row yields 13 samples stored at output_buf[ctr] + output_col. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 13; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: same kernel as pass 1, fed from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor here for final descale: after the CONST_BITS shift it sits one bit below the CONST_BITS+PASS1_BITS+3 output shift. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[4];
+    z4 = (INT32) wsptr[6];
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
+
+    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
+    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
+
+    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
+    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
+
+    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
+
+    /* Odd part: same kernel as pass 1, fed from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
+    tmp15 = z1 + z4;
+    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp11 += tmp14;
+    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp12 += tmp14;
+    tmp13 += tmp14;
+    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
+    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
+    tmp14 += z1;
+    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
+	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+
+    /* Final output stage: descale each sum/difference, mask with RANGE_MASK, and map through range_limit to get the sample. */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row of 8 workspace values */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x14 output block.
+ *
+ * Optimized algorithm with 20 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/28).
+ */
+
+GLOBAL(void)
+jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*14];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp13 = z4 << CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
+    tmp16 += tmp15;
+    z1    += z4;
+    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
+    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
+    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
+
+    tmp13 = (z1 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) (tmp23 + tmp13);
+    wsptr[8*10] = (int) (tmp23 - tmp13);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 14 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 14; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+    z4 <<= CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
+    tmp16 += tmp15;
+    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
+    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
+    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
+
+    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 15x15 output block.
+ *
+ * Optimized algorithm with 22 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/30).
+ */
+
+GLOBAL(void)
+jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*15];	/* buffers data between passes: 15 rows x 8 columns */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 8 columns from input (15 outputs each), store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: built from coefficient rows 0, 2, 4, 6 of this column */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor (rounding bias) here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+    tmp12 = z1 - tmp10;
+    tmp13 = z1 + tmp11;
+    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+
+    z4 = z2 - z3;
+    z3 += z2;
+    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
+
+    tmp20 = tmp13 + tmp10 + tmp11;
+    tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+    tmp25 = tmp13 - tmp10 - tmp11;
+    tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+    tmp21 = tmp12 + tmp10 + tmp11;
+    tmp24 = tmp13 - tmp10 + tmp11;
+    tmp11 += tmp11;
+    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
+    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
+
+    /* Odd part: built from coefficient rows 1, 3, 5, 7 of this column */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp13 = z2 - z4;
+    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
+    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
+    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
+
+    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
+    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    z2 = z1 - z4;
+    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
+
+    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
+    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
+    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
+    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
+
+    /* Final output stage: rows k and 14-k are even+odd and even-odd; row 7 is even only */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 15 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 15; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: built from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor (rounding bias) here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[4];
+    z4 = (INT32) wsptr[6];
+
+    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+    tmp12 = z1 - tmp10;
+    tmp13 = z1 + tmp11;
+    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+
+    z4 = z2 - z3;
+    z3 += z2;
+    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
+
+    tmp20 = tmp13 + tmp10 + tmp11;
+    tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+    tmp25 = tmp13 - tmp10 - tmp11;
+    tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+    tmp21 = tmp12 + tmp10 + tmp11;
+    tmp24 = tmp13 - tmp10 + tmp11;
+    tmp11 += tmp11;
+    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
+    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
+
+    /* Odd part: built from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z4 = (INT32) wsptr[5];
+    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
+    z4 = (INT32) wsptr[7];
+
+    tmp13 = z2 - z4;
+    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
+    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
+    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
+
+    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
+    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    z2 = z1 - z4;
+    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
+
+    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
+    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
+    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
+    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
+
+    /* Final output stage: descale, mask with RANGE_MASK, look up in range_limit[] */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x16 output block.
+ *
+ * Optimized algorithm with 28 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/32).
+ */
+
+GLOBAL(void)
+jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*16];	/* buffers data between passes: 16 rows x 8 columns */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 8 columns from input (16 outputs each), store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: built from coefficient rows 0, 2, 4, 6 of this column */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor (rounding bias) here for final descale; use ONE as in pass 2. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part: built from coefficient rows 1, 3, 5, 7 of this column */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage: rows k and 15-k are even+odd and even-odd */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 16 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 16; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: built from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor (rounding bias) here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[4];
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part: built from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage: descale, mask with RANGE_MASK, look up in range_limit[] */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x8 output block.
+ *
+ * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*8];	/* buffers data between passes: 8 rows x 8 columns */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = DCTSIZE; ctr > 0; ctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any column in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * column DCT calculations can be simplified this way.
+     */
+    
+    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+	inptr[DCTSIZE*7] == 0) {
+      /* AC terms all zero: fill the whole workspace column with the scaled DC value */
+      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+      
+      wsptr[DCTSIZE*0] = dcval;
+      wsptr[DCTSIZE*1] = dcval;
+      wsptr[DCTSIZE*2] = dcval;
+      wsptr[DCTSIZE*3] = dcval;
+      wsptr[DCTSIZE*4] = dcval;
+      wsptr[DCTSIZE*5] = dcval;
+      wsptr[DCTSIZE*6] = dcval;
+      wsptr[DCTSIZE*7] = dcval;
+      
+      inptr++;			/* advance pointers to next column */
+      quantptr++;
+      wsptr++;
+      continue;
+    }
+    
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor (rounding bias) here for final descale. */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+    
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+    
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+    
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+    
+    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    
+    inptr++;			/* advance pointers to next column */
+    quantptr++;
+    wsptr++;
+  }
+
+  /* Pass 2: process 8 rows from work array, store into output array.
+   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: built from workspace entries 0, 2, 4, 6 of this row */
+
+    /* Add fudge factor (rounding bias) here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[4];
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part: built from workspace entries 1, 3, 5, 7 of this row */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage: descale, mask with RANGE_MASK, look up in range_limit[] */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x7 output block: 7 rows (output_buf[0..6]) of 14 samples
+ * each, written starting at column output_col.
+ * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  int * wsptr;			/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  int workspace[8*7];	/* buffers data between passes: 7 rows of 8 values, row stride 8 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns, store into work array.
+   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+   * Only coefficient rows 0..6 are read; results are descaled by CONST_BITS-PASS1_BITS. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient rows 0, 2, 4, 6 */
+
+    tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp23 <<= CONST_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    /* Add fudge factor here for final descale: half of 1 << (CONST_BITS-PASS1_BITS), so the shift rounds. */
+    tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
+    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
+    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp10 = z1 + z3;	/* tmp10 is scratch here; it is reassigned in the odd part */
+    z2 -= tmp10;
+    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
+    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
+    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
+
+    /* Odd part: coefficient rows 1, 3, 5 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
+    tmp10 = tmp11 - tmp12;
+    tmp11 += tmp12;
+    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
+    tmp11 += tmp12;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
+    tmp10 += z2;
+    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
+
+    /* Final output stage: workspace rows k and 6-k are even part plus/minus odd part */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);	/* middle row: even part only */
+  }
+
+  /* Pass 2: process 7 rows from work array, store into output array.
+   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+   * Each row consumes 8 workspace values and produces 14 output samples. */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: workspace columns 0, 4, 2, 6 */
+
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+PASS1_BITS+3). */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part: workspace columns 1, 3, 5, 7; tmp10..tmp16 are reused for the odd terms */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+    z4 <<= CONST_BITS;	/* z4 is only added or subtracted below, so it is pre-scaled once */
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;	/* z1 now holds wsptr[1] - wsptr[3] */
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
+    tmp16 += tmp15;
+    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
+    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
+    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
+
+    tmp13 = ((z1 - z3) << CONST_BITS) + z4;	/* (w1 - w3 - w5 + w7) << CONST_BITS, using the reduced z1 and pre-scaled z4 */
+
+    /* Final output stage: samples k and 13-k are even part plus/minus odd part, descaled, masked, then range-limited */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row (workspace row stride is 8) */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x6 output block: 6 rows (output_buf[0..5]) of 12 samples
+ * each, written starting at column output_col.
+ * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  int * wsptr;			/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  int workspace[8*6];	/* buffers data between passes: 6 rows of 8 values, row stride 8 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns, store into work array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   * Only coefficient rows 0..5 are read; results are descaled by CONST_BITS-PASS1_BITS. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient rows 0, 4, 2 */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 <<= CONST_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    /* Add fudge factor here for final descale: half of 1 << (CONST_BITS-PASS1_BITS), so the shift rounds. */
+    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
+    tmp11 = tmp10 + tmp20;
+    tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);	/* descaled here, so rows 1 and 4 need no shift below */
+    tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
+    tmp20 = tmp11 + tmp10;
+    tmp22 = tmp11 - tmp10;
+
+    /* Odd part: coefficient rows 1, 3, 5 */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+    tmp11 = (z1 - z2 - z3) << PASS1_BITS;	/* no multiply needed: produced directly at the workspace scale */
+
+    /* Final output stage: workspace rows k and 5-k are even part plus/minus odd part */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) (tmp21 + tmp11);	/* both operands are already at the workspace scale */
+    wsptr[8*4] = (int) (tmp21 - tmp11);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array.
+   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+   * Each row consumes 8 workspace values and produces 12 output samples. */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: workspace columns 0, 4, 2, 6 */
+
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+PASS1_BITS+3). */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+
+    z4 = (INT32) wsptr[4];
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = (INT32) wsptr[2];
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;	/* the c2 product was taken first; z1 is now scaled for plain add/subtract */
+    z2 = (INT32) wsptr[6];
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;	/* tmp12 is a scratch value, reassigned three times in this even part */
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part: workspace columns 1, 3, 5, 7; tmp10..tmp15 are reused for the odd terms */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;	/* tmp11 and tmp14 above were intermediates; they are recomputed below for samples 1/10 and 4/7 */
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage: samples k and 11-k are even part plus/minus odd part, descaled, masked, then range-limited */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row (workspace row stride is 8) */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x5 output block: 5 rows (output_buf[0..4]) of 10 samples
+ * each, written starting at column output_col.
+ * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  int * wsptr;			/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  int workspace[8*5];	/* buffers data between passes: 5 rows of 8 values, row stride 8 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns, store into work array.
+   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+   * Only coefficient rows 0..4 are read; results are descaled by CONST_BITS-PASS1_BITS. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient rows 0, 2, 4 */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 <<= CONST_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    /* Add fudge factor here for final descale: half of 1 << (CONST_BITS-PASS1_BITS), so the shift rounds. */
+    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;	/* middle-row term: DC minus 4*z2 */
+
+    /* Odd part: coefficient rows 1, 3; tmp13 and tmp14 are reused for the odd terms */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
+    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
+    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
+
+    /* Final output stage: workspace rows k and 4-k are even part plus/minus odd part */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);	/* middle row: even part only */
+  }
+
+  /* Pass 2: process 5 rows from work array, store into output array.
+   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+   * Each row consumes 8 workspace values and produces 10 output samples. */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: workspace columns 0, 4, 2, 6 */
+
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+PASS1_BITS+3). */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part: workspace columns 1, 3, 5, 7; tmp10..tmp14 are reused for the odd terms */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z3 <<= CONST_BITS;	/* z3 is only added or subtracted below, so it is pre-scaled once */
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z3 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));	/* the last term is half of tmp13 << CONST_BITS */
+
+    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;	/* (w1 - w3 + w7 - w5) << CONST_BITS, using the pre-scaled z3 */
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage: samples k and 9-k are even part plus/minus odd part, descaled, masked, then range-limited */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row (workspace row stride is 8) */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 8x4 output block: 4 rows (output_buf[0..3]) of 8 samples
+ * each, written starting at column output_col.
+ * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  int * wsptr;			/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  int workspace[8*4];	/* buffers data between passes: 4 rows of 8 values, row stride 8 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the 8 input columns (coefficient rows 0..3), store into work array.
+   * 4-point IDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient rows 0 and 2, no multiplies needed */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+    tmp10 = (tmp0 + tmp2) << PASS1_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+    /* Odd part: coefficient rows 1 and 3; tmp0 and tmp2 are reused for the odd terms */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
+    /* Add fudge factor here for final descale; z1 feeds both tmp0 and tmp2, so both shifts round. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+		       CONST_BITS-PASS1_BITS);
+    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+		       CONST_BITS-PASS1_BITS);
+
+    /* Final output stage: both operands are already at the workspace scale, so no shift here */
+
+    wsptr[8*0] = (int) (tmp10 + tmp0);
+    wsptr[8*3] = (int) (tmp10 - tmp0);
+    wsptr[8*1] = (int) (tmp12 + tmp2);
+    wsptr[8*2] = (int) (tmp12 - tmp2);
+  }
+
+  /* Pass 2: process the 4 rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling (hence the CONST_BITS+PASS1_BITS+3 shifts below). */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+PASS1_BITS+3). */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+    
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    tmp0 = (INT32) wsptr[7];
+    tmp1 = (INT32) wsptr[5];
+    tmp2 = (INT32) wsptr[3];
+    tmp3 = (INT32) wsptr[1];
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3; samples k and 7-k are sum and difference */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += DCTSIZE;		/* advance pointer to next row; pass 1 stored rows 8 apart, so this relies on DCTSIZE == 8 */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x3 output block: 3 rows (output_buf[0..2]) of
+ * 6 samples each, written starting at column output_col.
+ * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  int * wsptr;			/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  int workspace[6*3];	/* buffers data between passes: 3 rows of 6 values, row stride 6 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the first 6 input columns, store into work array.
+   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+   * Only coefficient rows 0..2 are read; results are descaled by CONST_BITS-PASS1_BITS. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient rows 0 and 2 */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    /* Add fudge factor here for final descale: half of 1 << (CONST_BITS-PASS1_BITS), so the shift rounds. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;	/* middle-row term: DC minus twice the c2 product */
+
+    /* Odd part: coefficient row 1; tmp12 and tmp0 are reused */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage: workspace rows 0 and 2 are even part plus/minus odd part */
+
+    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);	/* middle row: even part only */
+  }
+  
+  /* Pass 2: process 3 rows from work array, store into output array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   * Each row consumes 6 workspace values and produces 6 output samples. */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: workspace columns 0, 4, 2 */
+
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+PASS1_BITS+3). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[4];
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = tmp0 - tmp10 - tmp10;
+    tmp10 = (INT32) wsptr[2];
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part: workspace columns 1, 3, 5; tmp0..tmp2 are reused for the odd terms */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << CONST_BITS;	/* no multiply needed for this term */
+
+    /* Final output stage: samples k and 5-k are even part plus/minus odd part, descaled, masked, then range-limited */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row (workspace row stride is 6) */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x2 output block: 2 rows (output_buf[0..1]) of 4 samples
+ * each, written starting at column output_col.
+ * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;		/* steps across the columns of coef_block */
+  ISLOW_MULT_TYPE * quantptr;	/* steps through compptr->dct_table in lockstep with inptr */
+  INT32 * wsptr;		/* cursor into workspace */
+  JSAMPROW outptr;		/* first sample of the output row being written */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to every final sample */
+  int ctr;
+  INT32 workspace[4*2];	/* buffers data between passes: 2 rows of 4 INT32 values, row stride 4 */
+  SHIFT_TEMPS
+
+  /* Pass 1: process the first 4 input columns (coefficient rows 0 and 1), store into work array; no scaling is applied. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: coefficient row 0 */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+
+    /* Odd part: coefficient row 1 */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+    /* Final output stage: plain sum and difference, stored unscaled */
+
+    wsptr[4*0] = tmp10 + tmp0;
+    wsptr[4*1] = tmp10 - tmp0;
+  }
+
+  /* Pass 2: process 2 rows from work array, store into output array.
+   * 4-point IDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 2; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: workspace columns 0 and 2 */
+
+    /* Add fudge factor here for final descale: after the shift by CONST_BITS it is half of 1 << (CONST_BITS+3). */
+    tmp0 = wsptr[0] + (ONE << 2);
+    tmp2 = wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;	/* NOTE(review): left shift of a signed, possibly negative value - confirm compiler behaviour */
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part: workspace columns 1 and 3; tmp0 and tmp2 are reused for the odd terms */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = wsptr[1];
+    z3 = wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage: shift is CONST_BITS+3 only, because pass 1 added no PASS1_BITS scaling */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 4;		/* advance pointer to next row (workspace row stride is 4) */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x1 output block: two samples written to output_buf[0]
+ * starting at column output_col.
+ * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp10;
+  ISLOW_MULT_TYPE * quantptr;	/* start of compptr->dct_table; only entries 0 and 1 are read */
+  JSAMPROW outptr;		/* first sample of the single output row */
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* lookup table applied to both final samples */
+  SHIFT_TEMPS
+
+  /* Pass 1: empty (a 1-point column transform leaves the data unchanged, so no workspace is used). */
+
+  /* Pass 2: process 1 row from input, store into output array; only coef_block[0] and coef_block[1] are read. */
+
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  outptr = output_buf[0] + output_col;
+
+  /* Even part: DC coefficient */
+
+  tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
+  /* Add fudge factor here for final descale: half of 1 << 3, so the shift by 3 rounds. */
+  tmp10 += ONE << 2;
+
+  /* Odd part: second coefficient of the first row */
+
+  tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
+
+  /* Final output stage: sum and difference, descaled by 3 bits, masked, then range-limited */
+
+  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
+  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 8x16 output block: output_buf[0..15] each receive 8 samples
+ * starting at output_col.  All 8 coefficient columns are read.
+ * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*16];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 8 columns from input, store into work array as
+   * 16 rows of 8 ints.  16-point kernel, cK is sqrt(2) * cos(K*pi/32).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: input rows 0, 4, 2, 6 are combined into tmp20..tmp27. */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here so the final descale rounds (if ONE == 1). */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part: input rows 1, 3, 5, 7; reuses tmp0..tmp3 and tmp10..tmp13. */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage: rows k and 15-k get even+odd and even-odd. */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+  }
+  
+  /* Pass 2: process 16 rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 16; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+    
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    /* Add fudge factor here for final descale; CONST_BITS shift follows. */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+    
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+    
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+    
+    tmp0 = (INT32) wsptr[7];
+    tmp1 = (INT32) wsptr[5];
+    tmp2 = (INT32) wsptr[3];
+    tmp3 = (INT32) wsptr[1];
+    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+    
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+    
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    
+    wsptr += DCTSIZE;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x14 output block: output_buf[0..13] each receive 7 samples
+ * starting at output_col.  Only the first 7 coefficient columns are read.
+ * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[7*14];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 7 columns from input, store into work array as
+   * 14 rows of 7 ints.  14-point kernel, cK is sqrt(2) * cos(K*pi/28).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: input rows 0, 4, 2, 6 are combined into tmp20..tmp26. */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here so the final descale rounds (if ONE == 1). */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+    /* tmp23 is descaled right here, unlike tmp20..tmp22 and tmp24..tmp26. */
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part: input rows 1, 3, 5, 7; reuses tmp10..tmp16. */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp13 = z4 << CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
+    tmp16 += tmp15;
+    z1    += z4;
+    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
+    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
+    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
+
+    tmp13 = (z1 - z3) << PASS1_BITS;
+
+    /* Final output stage: rows 3 and 10 are stored without a further shift. */
+
+    wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[7*3]  = (int) (tmp23 + tmp13);
+    wsptr[7*10] = (int) (tmp23 - tmp13);
+    wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 14 rows from work array, store into output array.
+   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 14; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: work-array columns 0, 2, 4, 6 -> tmp20..tmp23. */
+
+    /* Add fudge factor here for final descale; CONST_BITS shift follows. */
+    tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp23 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
+    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
+    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp10 = z1 + z3;
+    z2 -= tmp10;
+    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
+    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
+    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
+
+    /* Odd part: work-array columns 1, 3, 5 -> tmp10..tmp12. */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
+    tmp10 = tmp11 - tmp12;
+    tmp11 += tmp12;
+    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
+    tmp11 += tmp12;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
+    tmp10 += z2;
+    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
+
+    /* Final output stage: outptr[3] (middle) takes the even part only. */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 7;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 6x12 output block: output_buf[0..11] each receive 6 samples
+ * starting at output_col.  Only the first 6 coefficient columns are read.
+ * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[6*12];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 6 columns from input, store into work array as
+   * 12 rows of 6 ints.  12-point kernel, cK is sqrt(2) * cos(K*pi/24).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: input rows 0, 4, 2, 6 are combined into tmp20..tmp25. */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here so the final descale rounds (if ONE == 1). */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 <<= CONST_BITS;
+    /* z1, z2 are only shifted: a factor of 1.0 (c6 of this kernel is 1). */
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part: input rows 1, 3, 5, 7; reuses tmp10..tmp15. */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage: rows k and 11-k get even+odd and even-odd. */
+
+    wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 12 rows from work array, store into output array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 12; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: work-array columns 0, 4, 2 -> tmp20..tmp22. */
+
+    /* Add fudge factor here for final descale; CONST_BITS shift follows. */
+    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 <<= CONST_BITS;
+    tmp12 = (INT32) wsptr[4];
+    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
+    tmp11 = tmp10 + tmp20;
+    tmp21 = tmp10 - tmp20 - tmp20;
+    tmp20 = (INT32) wsptr[2];
+    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
+    tmp20 = tmp11 + tmp10;
+    tmp22 = tmp11 - tmp10;
+
+    /* Odd part: work-array columns 1, 3, 5 -> tmp10..tmp12. */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+    tmp11 = (z1 - z2 - z3) << CONST_BITS;
+
+    /* Final output stage: columns k and 5-k get even+odd and even-odd. */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 5x10 output block: output_buf[0..9] each receive 5 samples
+ * starting at output_col.  Only the first 5 coefficient columns are read.
+ * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4, z5;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[5*10];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 5 columns from input, store into work array as
+   * 10 rows of 5 ints.  10-point kernel, cK is sqrt(2) * cos(K*pi/20).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part: input rows 0, 4, 2, 6 are combined into tmp20..tmp24. */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here so the final descale rounds (if ONE == 1). */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+    /* tmp22 is descaled right here, unlike tmp20, tmp21, tmp23, tmp24. */
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part: input rows 1, 3, 5, 7; reuses tmp10..tmp14. */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+    z5 = z3 << CONST_BITS;
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z5 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage: rows 2 and 7 are stored without a further shift. */
+
+    wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[5*2] = (int) (tmp22 + tmp12);
+    wsptr[5*7] = (int) (tmp22 - tmp12);
+    wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 10 rows from work array, store into output array.
+   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 10; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: work-array columns 0, 2, 4 -> tmp10..tmp12. */
+
+    /* Add fudge factor here for final descale; CONST_BITS shift follows. */
+    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 <<= CONST_BITS;
+    tmp13 = (INT32) wsptr[2];
+    tmp14 = (INT32) wsptr[4];
+    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;
+
+    /* Odd part: work-array columns 1, 3 -> tmp13, tmp14. */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
+    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
+    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
+
+    /* Final output stage: outptr[2] (middle) takes the even part only. */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 5;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x8 output block: output_buf[0..7] each receive 4 samples
+ * starting at output_col.  Only the first 4 coefficient columns are read.
+ * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[4*8];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process 4 columns from input, store into work array. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 4; ctr > 0; ctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any column in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * column DCT calculations can be simplified this way.
+     */
+
+    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+	inptr[DCTSIZE*7] == 0) {
+      /* AC terms all zero: replicate the scaled DC value down the column */
+      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+
+      wsptr[4*0] = dcval;
+      wsptr[4*1] = dcval;
+      wsptr[4*2] = dcval;
+      wsptr[4*3] = dcval;
+      wsptr[4*4] = dcval;
+      wsptr[4*5] = dcval;
+      wsptr[4*6] = dcval;
+      wsptr[4*7] = dcval;
+
+      inptr++;			/* advance pointers to next column */
+      quantptr++;
+      wsptr++;
+      continue;
+    }
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here so the final descale rounds (if ONE == 1). */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+    inptr++;			/* advance pointers to next column */
+    quantptr++;
+    wsptr++;
+  }
+
+  /* Pass 2: process 8 rows from work array, store into output array.
+   * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: work-array columns 0 and 2 -> tmp10, tmp12. */
+
+    /* Add fudge factor here for final descale; CONST_BITS shift follows. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp2 = (INT32) wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part: work-array columns 1 and 3; reuses tmp0 and tmp2. */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage: columns k and 3-k get even+odd and even-odd. */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    
+    wsptr += 4;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x6 output block (3 samples wide, 6 rows high).
+ * Only coefficient columns 0..2 and rows 0..5 of coef_block are read.
+ * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[3*6];	/* 3 columns x 6 rows; buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   * Stored results keep PASS1_BITS extra bits of scale for pass 2. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor (half the pass 1 descale divisor) for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+
+    /* Final output stage; tmp11 and tmp1 are already at workspace scale */
+
+    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*1] = (int) (tmp11 + tmp1);
+    wsptr[3*4] = (int) (tmp11 - tmp1);
+    wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array.
+   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+   * Each row writes 3 samples starting at output_buf[ctr][output_col]. */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale (half the divisor once shifted). */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[2];
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;
+
+    /* Odd part */
+
+    tmp12 = (INT32) wsptr[1];
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage: descale, mask with RANGE_MASK, index range_limit */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 3;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x4 output block (2 samples wide, 4 rows high).
+ * Only coefficient columns 0..1 and rows 0..3 of coef_block are read.
+ * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  INT32 * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  INT32 workspace[2*4];	/* 2 columns x 4 rows; INT32 as pass 1 stores undescaled values */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 4-point IDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage: no descale here; pass 2 does the only descale */
+
+    wsptr[2*0] = tmp10 + tmp0;
+    wsptr[2*3] = tmp10 - tmp0;
+    wsptr[2*1] = tmp12 + tmp2;
+    wsptr[2*2] = tmp12 - tmp2;
+  }
+
+  /* Pass 2: process 4 rows from work array, store 2 samples per row into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor (half of 2**(CONST_BITS+3)) here for final descale. */
+    tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
+
+    /* Odd part */
+
+    tmp0 = wsptr[1];
+
+    /* Final output stage: sum and difference, descaled by CONST_BITS+3 */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 2;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 1x2 output block (1 sample wide, 2 rows high).
+ * Only coef_block[DCTSIZE*0] and coef_block[DCTSIZE*1] are read.
+ * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp10;
+  ISLOW_MULT_TYPE * quantptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* Process 1 column from input, store into output array (no workspace needed). */
+
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+  /* Even part */
+    
+  tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
+  /* Add fudge factor (half of the 2**3 divisor) here for final descale. */
+  tmp10 += ONE << 2;
+
+  /* Odd part */
+
+  tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+  /* Final output stage: sum goes to row 0, difference to row 1 */
+
+  output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
+					  & RANGE_MASK];
+  output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
+					  & RANGE_MASK];
+}
+
+#endif /* IDCT_SCALING_SUPPORTED */
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/jpeg/jidctred.c b/jpeg/jidctred.c
deleted file mode 100644
index 421f3c7ca..000000000
--- a/jpeg/jidctred.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * jidctred.c
- *
- * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains inverse-DCT routines that produce reduced-size output:
- * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
- *
- * The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M)
- * algorithm used in jidctint.c.  We simply replace each 8-to-8 1-D IDCT step
- * with an 8-to-4 step that produces the four averages of two adjacent outputs
- * (or an 8-to-2 step producing two averages of four outputs, for 2x2 output).
- * These steps were derived by computing the corresponding values at the end
- * of the normal LL&M code, then simplifying as much as possible.
- *
- * 1x1 is trivial: just take the DC coefficient divided by 8.
- *
- * See jidctint.c for additional comments.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
-
-#ifdef IDCT_SCALING_SUPPORTED
-
-
-/*
- * This module is specialized to the case DCTSIZE = 8.
- */
-
-#if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
-#endif
-
-
-/* Scaling is the same as in jidctint.c. */
-
-#if BITS_IN_JSAMPLE == 8
-#define CONST_BITS  13
-#define PASS1_BITS  2
-#else
-#define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
-#endif
-
-/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
- * causing a lot of useless floating-point operations at run time.
- * To get around this we use the following pre-calculated constants.
- * If you change CONST_BITS you may want to add appropriate values.
- * (With a reasonable C compiler, you can just rely on the FIX() macro...)
- */
-
-#if CONST_BITS == 13
-#define FIX_0_211164243  ((INT32)  1730)	/* FIX(0.211164243) */
-#define FIX_0_509795579  ((INT32)  4176)	/* FIX(0.509795579) */
-#define FIX_0_601344887  ((INT32)  4926)	/* FIX(0.601344887) */
-#define FIX_0_720959822  ((INT32)  5906)	/* FIX(0.720959822) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_850430095  ((INT32)  6967)	/* FIX(0.850430095) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_061594337  ((INT32)  8697)	/* FIX(1.061594337) */
-#define FIX_1_272758580  ((INT32)  10426)	/* FIX(1.272758580) */
-#define FIX_1_451774981  ((INT32)  11893)	/* FIX(1.451774981) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_2_172734803  ((INT32)  17799)	/* FIX(2.172734803) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_624509785  ((INT32)  29692)	/* FIX(3.624509785) */
-#else
-#define FIX_0_211164243  FIX(0.211164243)
-#define FIX_0_509795579  FIX(0.509795579)
-#define FIX_0_601344887  FIX(0.601344887)
-#define FIX_0_720959822  FIX(0.720959822)
-#define FIX_0_765366865  FIX(0.765366865)
-#define FIX_0_850430095  FIX(0.850430095)
-#define FIX_0_899976223  FIX(0.899976223)
-#define FIX_1_061594337  FIX(1.061594337)
-#define FIX_1_272758580  FIX(1.272758580)
-#define FIX_1_451774981  FIX(1.451774981)
-#define FIX_1_847759065  FIX(1.847759065)
-#define FIX_2_172734803  FIX(2.172734803)
-#define FIX_2_562915447  FIX(2.562915447)
-#define FIX_3_624509785  FIX(3.624509785)
-#endif
-
-
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
- * For 8-bit samples with the recommended scaling, all the variable
- * and constant values involved are no more than 16 bits wide, so a
- * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
- * For 12-bit samples, a full 32-bit multiplication will be needed.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
-#else
-#define MULTIPLY(var,const)  ((var) * (const))
-#endif
-
-
-/* Dequantize a coefficient by multiplying it by the multiplier-table
- * entry; produce an int result.  In this module, both inputs and result
- * are 16 bits or less, so either int or short multiply will work.
- */
-
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 4x4 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp2, tmp10, tmp12;
-  INT32 z1, z2, z3, z4;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*4];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process column 4, because second pass won't use it */
-    if (ctr == DCTSIZE-4)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-	inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      
-      continue;
-    }
-    
-    /* Even part */
-    
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= (CONST_BITS+1);
-    
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
-    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
-    
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-    
-    /* Odd part */
-    
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
-  }
-  
-  /* Pass 2: process 4 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 4; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-    
-    tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1);
-    
-    tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
-	 + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865);
-    
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-    
-    /* Odd part */
-    
-    z1 = (INT32) wsptr[7];
-    z2 = (INT32) wsptr[5];
-    z3 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[1];
-    
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 2x2 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp10, z1;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*2];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process columns 2,4,6 */
-    if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      
-      continue;
-    }
-    
-    /* Even part */
-    
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = z1 << (CONST_BITS+2);
-    
-    /* Odd part */
-
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
-  }
-  
-  /* Pass 2: process 2 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 2; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-    
-    tmp10 = ((INT32) wsptr[0]) << (CONST_BITS+2);
-    
-    /* Odd part */
-
-    tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-	 + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-	 + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-	 + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 1x1 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  int dcval;
-  ISLOW_MULT_TYPE * quantptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  SHIFT_TEMPS
-
-  /* We hardly need an inverse DCT routine for this: just take the
-   * average pixel value, which is one-eighth of the DC coefficient.
-   */
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((INT32) dcval, 3);
-
-  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
-}
-
-#endif /* IDCT_SCALING_SUPPORTED */
diff --git a/jpeg/jmorecfg.h b/jpeg/jmorecfg.h
index 4f491fc91..928d052c8 100644
--- a/jpeg/jmorecfg.h
+++ b/jpeg/jmorecfg.h
@@ -2,6 +2,7 @@
  * jmorecfg.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -54,9 +55,22 @@
  * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
+#ifdef HAVE_UNSIGNED_CHAR
+
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int) (value))
 
+#else /* not HAVE_UNSIGNED_CHAR */
+
+typedef char JSAMPLE;
+#ifdef CHAR_IS_UNSIGNED
+#define GETJSAMPLE(value)  ((int) (value))
+#else
+#define GETJSAMPLE(value)  ((int) (value) & 0xFF)
+#endif /* CHAR_IS_UNSIGNED */
+
+#endif /* HAVE_UNSIGNED_CHAR */
+
 #define MAXJSAMPLE	255
 #define CENTERJSAMPLE	128
 
@@ -92,9 +106,23 @@ typedef short JCOEF;
  * managers, this is also the data type passed to fread/fwrite.
  */
 
+#ifdef HAVE_UNSIGNED_CHAR
+
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
+#else /* not HAVE_UNSIGNED_CHAR */
+
+typedef char JOCTET;
+#ifdef CHAR_IS_UNSIGNED
+#define GETJOCTET(value)  (value)
+#else
+#define GETJOCTET(value)  ((value) & 0xFF)
+#endif /* CHAR_IS_UNSIGNED */
+
+#endif /* HAVE_UNSIGNED_CHAR */
+
+
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
  * won't cost a huge amount of memory, so we don't provide special
@@ -104,11 +132,23 @@ typedef unsigned char JOCTET;
 
 /* UINT8 must hold at least the values 0..255. */
 
+#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
+#else /* not HAVE_UNSIGNED_CHAR */
+#ifdef CHAR_IS_UNSIGNED
+typedef char UINT8;
+#else /* not CHAR_IS_UNSIGNED */
+typedef short UINT8;
+#endif /* CHAR_IS_UNSIGNED */
+#endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
 
+#ifdef HAVE_UNSIGNED_SHORT
 typedef unsigned short UINT16;
+#else /* not HAVE_UNSIGNED_SHORT */
+typedef unsigned int UINT16;
+#endif /* HAVE_UNSIGNED_SHORT */
 
 /* INT16 must hold at least the values -32768..32767. */
 
@@ -118,8 +158,14 @@ typedef short INT16;
 
 /* INT32 must hold at least signed 32-bit values. */
 
-#ifndef XMD_H
-typedef int INT32;
+#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+#ifndef _BASETSD_H_		/* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H		/* MinGW is slightly different */
+#ifndef QGLOBAL_H		/* Qt defines it in qglobal.h */
+typedef long INT32;
+#endif
+#endif
+#endif
 #endif
 
 /* Datatype used for image dimensions.  The JPEG standard only supports
@@ -157,7 +203,12 @@ typedef unsigned int JDIMENSION;
  * Again, you can customize this if you need special linkage keywords.
  */
 
+#ifdef HAVE_PROTOTYPES
 #define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#else
+#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
+#endif
+
 
 /* Here is the pseudo-keyword for declaring pointers that must be "far"
  * on 80x86 machines.  Most of the specialized coding for 80x86 is handled
@@ -165,8 +216,13 @@ typedef unsigned int JDIMENSION;
  * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
  */
 
-#undef FAR
+#ifndef FAR
+#ifdef NEED_FAR_POINTERS
+#define FAR  far
+#else
 #define FAR
+#endif
+#endif
 
 
 /*
@@ -177,7 +233,7 @@ typedef unsigned int JDIMENSION;
  */
 
 #ifndef HAVE_BOOLEAN
-typedef char boolean;
+typedef int boolean;
 #endif
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
@@ -209,8 +265,6 @@ typedef char boolean;
  * (You may HAVE to do that if your compiler doesn't like null source files.)
  */
 
-/* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
-
 /* Capability options common to encoder and decoder: */
 
 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
@@ -219,9 +273,10 @@ typedef char boolean;
 
 /* Encoder capability options: */
 
-#undef  C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define DCT_SCALING_SUPPORTED	    /* Input rescaling via DCT? (Requires DCT_ISLOW)*/
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
 /* Note: if you selected 12-bit data precision, it is dangerous to turn off
  * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
@@ -235,12 +290,12 @@ typedef char boolean;
 
 /* Decoder capability options: */
 
-#undef  D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
 #define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
diff --git a/jpeg/jpegint.h b/jpeg/jpegint.h
index 95b00d405..0c27a4e4a 100644
--- a/jpeg/jpegint.h
+++ b/jpeg/jpegint.h
@@ -2,6 +2,7 @@
  * jpegint.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -99,14 +100,16 @@ struct jpeg_downsampler {
 };
 
 /* Forward DCT (also controls coefficient quantization) */
+typedef JMETHOD(void, forward_DCT_ptr,
+		(j_compress_ptr cinfo, jpeg_component_info * compptr,
+		 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+		 JDIMENSION start_row, JDIMENSION start_col,
+		 JDIMENSION num_blocks));
+
 struct jpeg_forward_dct {
   JMETHOD(void, start_pass, (j_compress_ptr cinfo));
-  /* perhaps this should be an array??? */
-  JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
-			      jpeg_component_info * compptr,
-			      JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-			      JDIMENSION start_row, JDIMENSION start_col,
-			      JDIMENSION num_blocks));
+  /* It is useful to allow each component to have a separate FDCT method. */
+  forward_DCT_ptr forward_DCT[MAX_COMPONENTS];
 };
 
 /* Entropy encoding */
@@ -210,10 +213,6 @@ struct jpeg_entropy_decoder {
   JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
   JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo,
 				JBLOCKROW *MCU_data));
-
-  /* This is here to share code between baseline and progressive decoders; */
-  /* other modules probably should not use it */
-  boolean insufficient_data;	/* set TRUE after emitting warning */
 };
 
 /* Inverse DCT (also performs dequantization) */
@@ -303,7 +302,7 @@ struct jpeg_color_quantizer {
 #define jinit_downsampler	jIDownsampler
 #define jinit_forward_dct	jIFDCT
 #define jinit_huff_encoder	jIHEncoder
-#define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
 #define jinit_marker_writer	jIMWriter
 #define jinit_master_decompress	jIDMaster
 #define jinit_d_main_controller	jIDMainC
@@ -312,7 +311,7 @@ struct jpeg_color_quantizer {
 #define jinit_input_controller	jIInCtlr
 #define jinit_marker_reader	jIMReader
 #define jinit_huff_decoder	jIHDecoder
-#define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
 #define jinit_inverse_dct	jIIDCT
 #define jinit_upsampler		jIUpsampler
 #define jinit_color_deconverter	jIDColor
@@ -327,6 +326,13 @@ struct jpeg_color_quantizer {
 #define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_natural_order7	jZAGTable7
+#define jpeg_natural_order6	jZAGTable6
+#define jpeg_natural_order5	jZAGTable5
+#define jpeg_natural_order4	jZAGTable4
+#define jpeg_natural_order3	jZAGTable3
+#define jpeg_natural_order2	jZAGTable2
+#define jpeg_aritab		jAriTab
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 
@@ -344,7 +350,7 @@ EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@@ -357,7 +363,7 @@ EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
 EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
@@ -381,6 +387,15 @@ EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
+extern const int jpeg_natural_order7[]; /* zz to natural order for 7x7 block */
+extern const int jpeg_natural_order6[]; /* zz to natural order for 6x6 block */
+extern const int jpeg_natural_order5[]; /* zz to natural order for 5x5 block */
+extern const int jpeg_natural_order4[]; /* zz to natural order for 4x4 block */
+extern const int jpeg_natural_order3[]; /* zz to natural order for 3x3 block */
+extern const int jpeg_natural_order2[]; /* zz to natural order for 2x2 block */
+
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];
 
 /* Suppress undefined-structure complaints if necessary. */
 
diff --git a/jpeg/jpeglib.h b/jpeg/jpeglib.h
index d1be8ddef..5039d4bf4 100644
--- a/jpeg/jpeglib.h
+++ b/jpeg/jpeglib.h
@@ -2,6 +2,7 @@
  * jpeglib.h
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -26,11 +27,17 @@
 #include "jmorecfg.h"		/* seldom changed options */
 
 
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
 /* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 80".
  */
 
-#define JPEG_LIB_VERSION  62	/* Version 6b */
+#define JPEG_LIB_VERSION  80	/* Version 8.0 */
 
 
 /* Various constants determining the sizes of things.
@@ -138,18 +145,18 @@ typedef struct {
    */
   JDIMENSION width_in_blocks;
   JDIMENSION height_in_blocks;
-  /* Size of a DCT block in samples.  Always DCTSIZE for compression.
-   * For decompression this is the size of the output from one DCT block,
-   * reflecting any scaling we choose to apply during the IDCT step.
-   * Values of 1,2,4,8 are likely to be supported.  Note that different
-   * components may receive different IDCT scalings.
+  /* Size of a DCT block in samples,
+   * reflecting any scaling we choose to apply during the DCT step.
+   * Values from 1 to 16 are supported.
+   * Note that different components may receive different DCT scalings.
    */
-  int DCT_scaled_size;
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
   /* The downsampled dimensions are the component's actual, unpadded number
-   * of samples at the main buffer (preprocessing/compression interface), thus
-   * downsampled_width = ceil(image_width * Hi/Hmax)
-   * and similarly for height.  For decompression, IDCT scaling is included, so
-   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+   * of samples at the main buffer (preprocessing/compression interface);
+   * DCT scaling is included, so
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE)
+   * and similarly for height.
    */
   JDIMENSION downsampled_width;	 /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
@@ -164,7 +171,7 @@ typedef struct {
   int MCU_width;		/* number of blocks per MCU, horizontally */
   int MCU_height;		/* number of blocks per MCU, vertically */
   int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_scaled_size */
+  int MCU_sample_width;	/* MCU width in samples: MCU_width * DCT_h_scaled_size */
   int last_col_width;		/* # of non-dummy blocks across in last MCU */
   int last_row_height;		/* # of non-dummy blocks down in last MCU */
 
@@ -291,6 +298,17 @@ struct jpeg_compress_struct {
    * helper routines to simplify changing parameters.
    */
 
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;	/* scaled JPEG image width */
+  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+
   int data_precision;		/* bits of precision in image data */
 
   int num_components;		/* # of color components in JPEG image */
@@ -298,14 +316,17 @@ struct jpeg_compress_struct {
 
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
-  
+
   JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
-  /* ptrs to coefficient quantization tables, or NULL if not defined */
-  
+  int q_scale_factor[NUM_QUANT_TBLS];
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */
+
   JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
   JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
-  
+
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@@ -321,6 +342,7 @@ struct jpeg_compress_struct {
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
   boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
   boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
   int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
   J_DCT_METHOD dct_method;	/* DCT algorithm selector */
 
@@ -364,6 +386,9 @@ struct jpeg_compress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
   /* The coefficient controller receives data in units of MCU rows as defined
    * for fully interleaved scans (whether the JPEG file is interleaved or not).
@@ -389,6 +414,10 @@ struct jpeg_compress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order;	/* natural-order position array */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+
   /*
    * Links to compression subobjects (methods and private variables of modules)
    */
@@ -535,6 +564,7 @@ struct jpeg_decompress_struct {
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
+  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
   boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
 
@@ -575,7 +605,8 @@ struct jpeg_decompress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
-  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
 
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
   /* The coefficient controller's input and output progress is measured in
@@ -583,7 +614,7 @@ struct jpeg_decompress_struct {
    * in fully interleaved JPEG scans, but are used whether the scan is
    * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
    * rows of each component.  Therefore, the IDCT output contains
-   * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+   * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row.
    */
 
   JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@@ -607,6 +638,12 @@ struct jpeg_decompress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+
   /* This field is shared between entropy decoder and marker parser.
    * It is either zero or the code of a JPEG marker that has been
    * read from the data source, but has not yet been processed.
@@ -836,11 +873,14 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_destroy_decompress	jDestDecompress
 #define jpeg_stdio_dest		jStdDest
 #define jpeg_stdio_src		jStdSrc
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
 #define jpeg_set_defaults	jSetDefaults
 #define jpeg_set_colorspace	jSetColorspace
 #define jpeg_default_colorspace	jDefColorspace
 #define jpeg_set_quality	jSetQuality
 #define jpeg_set_linear_quality	jSetLQuality
+#define jpeg_default_qtables	jDefQTables
 #define jpeg_add_quant_table	jAddQuantTable
 #define jpeg_quality_scaling	jQualityScaling
 #define jpeg_simple_progression	jSimProgress
@@ -850,6 +890,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_start_compress	jStrtCompress
 #define jpeg_write_scanlines	jWrtScanlines
 #define jpeg_finish_compress	jFinCompress
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
 #define jpeg_write_raw_data	jWrtRawData
 #define jpeg_write_marker	jWrtMarker
 #define jpeg_write_m_header	jWrtMHeader
@@ -866,6 +907,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_input_complete	jInComplete
 #define jpeg_new_colormap	jNewCMap
 #define jpeg_consume_input	jConsumeInput
+#define jpeg_core_output_dimensions	jCoreDimensions
 #define jpeg_calc_output_dimensions	jCalcDimensions
 #define jpeg_save_markers	jSaveMarkers
 #define jpeg_set_marker_processor	jSetMarker
@@ -910,6 +952,14 @@ EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
 EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));
 
+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+			       unsigned char ** outbuffer,
+			       unsigned long * outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+			      unsigned char * inbuffer,
+			      unsigned long insize));
+
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
 /* Compression parameter setup aids */
@@ -921,6 +971,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
 EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
 					  int scale_factor,
 					  boolean force_baseline));
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+				       boolean force_baseline));
 EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
 				       const unsigned int *basic_table,
 				       int scale_factor,
@@ -940,12 +992,15 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
 					     JDIMENSION num_lines));
 EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
 
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
 					    JSAMPIMAGE data,
 					    JDIMENSION num_lines));
 
-/* Write a special marker.  See libjpeg.doc concerning safe usage. */
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker
 	JPP((j_compress_ptr cinfo, int marker,
 	     const JOCTET * dataptr, unsigned int datalen));
@@ -999,6 +1054,7 @@ EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
 #define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */
 
 /* Precalculate output dimensions for current decompression parameters. */
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));
 
 /* Control saving of COM and APPn markers into marker_list. */
@@ -1093,4 +1149,10 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif
 
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}
+#endif
+#endif
+
 #endif /* JPEGLIB_H */
diff --git a/jpeg/jutils.c b/jpeg/jutils.c
index d18a95556..04351797c 100644
--- a/jpeg/jutils.c
+++ b/jpeg/jutils.c
@@ -2,6 +2,7 @@
  * jutils.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -63,6 +64,57 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
  63, 63, 63, 63, 63, 63, 63, 63
 };
 
+const int jpeg_natural_order7[7*7+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..6 */
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13,  6, 14, 21, 28, 35,
+ 42, 49, 50, 43, 36, 29, 22, 30,
+ 37, 44, 51, 52, 45, 38, 46, 53,
+ 54,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order6[6*6+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..5 */
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 41, 34, 27,
+ 20, 13, 21, 28, 35, 42, 43, 36,
+ 29, 37, 44, 45,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order5[5*5+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..4 */
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4, 12,
+ 19, 26, 33, 34, 27, 20, 28, 35,
+ 36,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order4[4*4+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..3 */
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 25, 18, 11, 19, 26, 27,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order3[3*3+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..2 */
+  0,  1,  8, 16,  9,  2, 10, 17,
+ 18,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order2[2*2+16] = { /* zigzag index -> natural index (row*8+col), rows/cols 0..1 */
+  0,  1,  8,  9,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* 16 extra entries (all 63) for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
 
 /*
  * Arithmetic utilities
diff --git a/jpeg/jversion.h b/jpeg/jversion.h
index 6472c58d3..70c8b6fe1 100644
--- a/jpeg/jversion.h
+++ b/jpeg/jversion.h
@@ -1,7 +1,7 @@
 /*
  * jversion.h
  *
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -9,6 +9,6 @@
  */
 
 
-#define JVERSION	"6b  27-Mar-1998"
+#define JVERSION	"8b  16-May-2010"
 
-#define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+#define JCOPYRIGHT	"Copyright (C) 2010, Thomas G. Lane, Guido Vollbeding"
diff --git a/jpeg/libjpeg.doc b/jpeg/libjpeg.doc
deleted file mode 100644
index 689b206c0..000000000
--- a/jpeg/libjpeg.doc
+++ /dev/null
@@ -1,3006 +0,0 @@
-USING THE IJG JPEG LIBRARY
-
-Copyright (C) 1994-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-This file describes how to use the IJG JPEG library within an application
-program.  Read it if you want to write a program that uses the library.
-
-The file example.c provides heavily commented skeleton code for calling the
-JPEG library.  Also see jpeglib.h (the include file to be used by application
-programs) for full details about data structures and function parameter lists.
-The library source code, of course, is the ultimate reference.
-
-Note that there have been *major* changes from the application interface
-presented by IJG version 4 and earlier versions.  The old design had several
-inherent limitations, and it had accumulated a lot of cruft as we added
-features while trying to minimize application-interface changes.  We have
-sacrificed backward compatibility in the version 5 rewrite, but we think the
-improvements justify this.
-
-
-TABLE OF CONTENTS
------------------
-
-Overview:
-	Functions provided by the library
-	Outline of typical usage
-Basic library usage:
-	Data formats
-	Compression details
-	Decompression details
-	Mechanics of usage: include files, linking, etc
-Advanced features:
-	Compression parameter selection
-	Decompression parameter selection
-	Special color spaces
-	Error handling
-	Compressed data handling (source and destination managers)
-	I/O suspension
-	Progressive JPEG support
-	Buffered-image mode
-	Abbreviated datastreams and multiple images
-	Special markers
-	Raw (downsampled) image data
-	Really raw data: DCT coefficients
-	Progress monitoring
-	Memory management
-	Memory usage
-	Library compile-time options
-	Portability considerations
-	Notes for MS-DOS implementors
-
-You should read at least the overview and basic usage sections before trying
-to program with the library.  The sections on advanced features can be read
-if and when you need them.
-
-
-OVERVIEW
-========
-
-Functions provided by the library
----------------------------------
-
-The IJG JPEG library provides C code to read and write JPEG-compressed image
-files.  The surrounding application program receives or supplies image data a
-scanline at a time, using a straightforward uncompressed image format.  All
-details of color conversion and other preprocessing/postprocessing can be
-handled by the library.
-
-The library includes a substantial amount of code that is not covered by the
-JPEG standard but is necessary for typical applications of JPEG.  These
-functions preprocess the image before JPEG compression or postprocess it after
-decompression.  They include colorspace conversion, downsampling/upsampling,
-and color quantization.  The application indirectly selects use of this code
-by specifying the format in which it wishes to supply or receive image data.
-For example, if colormapped output is requested, then the decompression
-library automatically invokes color quantization.
-
-A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
-and even more so in decompression postprocessing.  The decompression library
-provides multiple implementations that cover most of the useful tradeoffs,
-ranging from very-high-quality down to fast-preview operation.  On the
-compression side we have generally not provided low-quality choices, since
-compression is normally less time-critical.  It should be understood that the
-low-quality modes may not meet the JPEG standard's accuracy requirements;
-nonetheless, they are useful for viewers.
-
-A word about functions *not* provided by the library.  We handle a subset of
-the ISO JPEG standard; most baseline, extended-sequential, and progressive
-JPEG processes are supported.  (Our subset includes all features now in common
-use.)  Unsupported ISO options include:
-	* Hierarchical storage
-	* Lossless JPEG
-	* Arithmetic entropy coding (unsupported for legal reasons)
-	* DNL marker
-	* Nonintegral subsampling ratios
-We support both 8- and 12-bit data precision, but this is a compile-time
-choice rather than a run-time choice; hence it is difficult to use both
-precisions in a single application.
-
-By itself, the library handles only interchange JPEG datastreams --- in
-particular the widely used JFIF file format.  The library can be used by
-surrounding code to process interchange or abbreviated JPEG datastreams that
-are embedded in more complex file formats.  (For example, this library is
-used by the free LIBTIFF library to support JPEG compression in TIFF.)
-
-
-Outline of typical usage
-------------------------
-
-The rough outline of a JPEG compression operation is:
-
-	Allocate and initialize a JPEG compression object
-	Specify the destination for the compressed data (eg, a file)
-	Set parameters for compression, including image size & colorspace
-	jpeg_start_compress(...);
-	while (scan lines remain to be written)
-		jpeg_write_scanlines(...);
-	jpeg_finish_compress(...);
-	Release the JPEG compression object
-
-A JPEG compression object holds parameters and working state for the JPEG
-library.  We make creation/destruction of the object separate from starting
-or finishing compression of an image; the same object can be re-used for a
-series of image compression operations.  This makes it easy to re-use the
-same parameter settings for a sequence of images.  Re-use of a JPEG object
-also has important implications for processing abbreviated JPEG datastreams,
-as discussed later.
-
-The image data to be compressed is supplied to jpeg_write_scanlines() from
-in-memory buffers.  If the application is doing file-to-file compression,
-reading image data from the source file is the application's responsibility.
-The library emits compressed data by calling a "data destination manager",
-which typically will write the data into a file; but the application can
-provide its own destination manager to do something else.
-
-Similarly, the rough outline of a JPEG decompression operation is:
-
-	Allocate and initialize a JPEG decompression object
-	Specify the source of the compressed data (eg, a file)
-	Call jpeg_read_header() to obtain image info
-	Set parameters for decompression
-	jpeg_start_decompress(...);
-	while (scan lines remain to be read)
-		jpeg_read_scanlines(...);
-	jpeg_finish_decompress(...);
-	Release the JPEG decompression object
-
-This is comparable to the compression outline except that reading the
-datastream header is a separate step.  This is helpful because information
-about the image's size, colorspace, etc is available when the application
-selects decompression parameters.  For example, the application can choose an
-output scaling ratio that will fit the image into the available screen size.
-
-The decompression library obtains compressed data by calling a data source
-manager, which typically will read the data from a file; but other behaviors
-can be obtained with a custom source manager.  Decompressed data is delivered
-into in-memory buffers passed to jpeg_read_scanlines().
-
-It is possible to abort an incomplete compression or decompression operation
-by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
-simply release it by calling jpeg_destroy().
-
-JPEG compression and decompression objects are two separate struct types.
-However, they share some common fields, and certain routines such as
-jpeg_destroy() can work on either type of object.
-
-The JPEG library has no static variables: all state is in the compression
-or decompression object.  Therefore it is possible to process multiple
-compression and decompression operations concurrently, using multiple JPEG
-objects.
-
-Both compression and decompression can be done in an incremental memory-to-
-memory fashion, if suitable source/destination managers are used.  See the
-section on "I/O suspension" for more details.
-
-
-BASIC LIBRARY USAGE
-===================
-
-Data formats
-------------
-
-Before diving into procedural details, it is helpful to understand the
-image data format that the JPEG library expects or returns.
-
-The standard input image format is a rectangular array of pixels, with each
-pixel having the same number of "component" or "sample" values (color
-channels).  You must specify how many components there are and the colorspace
-interpretation of the components.  Most applications will use RGB data
-(three components per pixel) or grayscale data (one component per pixel).
-PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
-A remarkable number of people manage to miss this, only to find that their
-programs don't work with grayscale JPEG files.
-
-There is no provision for colormapped input.  JPEG files are always full-color
-or full grayscale (or sometimes another colorspace such as CMYK).  You can
-feed in a colormapped image by expanding it to full-color format.  However
-JPEG often doesn't work very well with source data that has been colormapped,
-because of dithering noise.  This is discussed in more detail in the JPEG FAQ
-and the other references mentioned in the README file.
-
-Pixels are stored by scanlines, with each scanline running from left to
-right.  The component values for each pixel are adjacent in the row; for
-example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
-array of data type JSAMPLE --- which is typically "unsigned char", unless
-you've changed jmorecfg.h.  (You can also change the RGB pixel layout, say
-to B,G,R order, by modifying jmorecfg.h.  But see the restrictions listed in
-that file before doing so.)
-
-A 2-D array of pixels is formed by making a list of pointers to the starts of
-scanlines; so the scanlines need not be physically adjacent in memory.  Even
-if you process just one scanline at a time, you must make a one-element
-pointer array to conform to this structure.  Pointers to JSAMPLE rows are of
-type JSAMPROW, and the pointer to the pointer array is of type JSAMPARRAY.
-
-The library accepts or supplies one or more complete scanlines per call.
-It is not possible to process part of a row at a time.  Scanlines are always
-processed top-to-bottom.  You can process an entire image in one call if you
-have it all in memory, but usually it's simplest to process one scanline at
-a time.
-
-For best results, source data values should have the precision specified by
-BITS_IN_JSAMPLE (normally 8 bits).  For instance, if you choose to compress
-data that's only 6 bits/channel, you should left-justify each value in a
-byte before passing it to the compressor.  If you need to compress data
-that has more than 8 bits/channel, compile with BITS_IN_JSAMPLE = 12.
-(See "Library compile-time options", later.)
-
-
-The data format returned by the decompressor is the same in all details,
-except that colormapped output is supported.  (Again, a JPEG file is never
-colormapped.  But you can ask the decompressor to perform on-the-fly color
-quantization to deliver colormapped output.)  If you request colormapped
-output then the returned data array contains a single JSAMPLE per pixel;
-its value is an index into a color map.  The color map is represented as
-a 2-D JSAMPARRAY in which each row holds the values of one color component,
-that is, colormap[i][j] is the value of the i'th color component for pixel
-value (map index) j.  Note that since the colormap indexes are stored in
-JSAMPLEs, the maximum number of colors is limited by the size of JSAMPLE
-(ie, at most 256 colors for an 8-bit JPEG library).
-
-
-Compression details
--------------------
-
-Here we revisit the JPEG compression outline given in the overview.
-
-1. Allocate and initialize a JPEG compression object.
-
-A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
-a bunch of subsidiary structures which are allocated via malloc(), but the
-application doesn't control those directly.)  This struct can be just a local
-variable in the calling routine, if a single routine is going to execute the
-whole JPEG compression sequence.  Otherwise it can be static or allocated
-from malloc().
-
-You will also need a structure representing a JPEG error handler.  The part
-of this that the library cares about is a "struct jpeg_error_mgr".  If you
-are providing your own error handler, you'll typically want to embed the
-jpeg_error_mgr struct in a larger structure; this is discussed later under
-"Error handling".  For now we'll assume you are just using the default error
-handler.  The default error handler will print JPEG error/warning messages
-on stderr, and it will call exit() if a fatal error occurs.
-
-You must initialize the error handler structure, store a pointer to it into
-the JPEG object's "err" field, and then call jpeg_create_compress() to
-initialize the rest of the JPEG object.
-
-Typical code for this step, if you are using the default error handler, is
-
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_error_mgr jerr;
-	...
-	cinfo.err = jpeg_std_error(&jerr);
-	jpeg_create_compress(&cinfo);
-
-jpeg_create_compress allocates a small amount of memory, so it could fail
-if you are out of memory.  In that case it will exit via the error handler;
-that's why the error handler must be initialized first.
-
-
-2. Specify the destination for the compressed data (eg, a file).
-
-As previously mentioned, the JPEG library delivers compressed data to a
-"data destination" module.  The library includes one data destination
-module which knows how to write to a stdio stream.  You can use your own
-destination module if you want to do something else, as discussed later.
-
-If you use the standard destination module, you must open the target stdio
-stream beforehand.  Typical code for this step looks like:
-
-	FILE * outfile;
-	...
-	if ((outfile = fopen(filename, "wb")) == NULL) {
-	    fprintf(stderr, "can't open %s\n", filename);
-	    exit(1);
-	}
-	jpeg_stdio_dest(&cinfo, outfile);
-
-where the last line invokes the standard destination module.
-
-WARNING: it is critical that the binary compressed data be delivered to the
-output file unchanged.  On non-Unix systems the stdio library may perform
-newline translation or otherwise corrupt binary data.  To suppress this
-behavior, you may need to use a "b" option to fopen (as shown above), or use
-setmode() or another routine to put the stdio stream in binary mode.  See
-cjpeg.c and djpeg.c for code that has been found to work on many systems.
-
-You can select the data destination after setting other parameters (step 3),
-if that's more convenient.  You may not change the destination between
-calling jpeg_start_compress() and jpeg_finish_compress().
-
-
-3. Set parameters for compression, including image size & colorspace.
-
-You must supply information about the source image by setting the following
-fields in the JPEG object (cinfo structure):
-
-	image_width		Width of image, in pixels
-	image_height		Height of image, in pixels
-	input_components	Number of color channels (samples per pixel)
-	in_color_space		Color space of source image
-
-The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
-of 1 to 64K pixels in either direction.  The input color space is typically
-RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
-color spaces", later, for more info.)  The in_color_space field must be
-assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
-JCS_GRAYSCALE.
-
-JPEG has a large number of compression parameters that determine how the
-image is encoded.  Most applications don't need or want to know about all
-these parameters.  You can set all the parameters to reasonable defaults by
-calling jpeg_set_defaults(); then, if there are particular values you want
-to change, you can do so after that.  The "Compression parameter selection"
-section tells about all the parameters.
-
-You must set in_color_space correctly before calling jpeg_set_defaults(),
-because the defaults depend on the source image colorspace.  However the
-other three source image parameters need not be valid until you call
-jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
-than once, if that happens to be convenient.
-
-Typical code for a 24-bit RGB source image is
-
-	cinfo.image_width = Width; 	/* image width and height, in pixels */
-	cinfo.image_height = Height;
-	cinfo.input_components = 3;	/* # of color components per pixel */
-	cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
-
-	jpeg_set_defaults(&cinfo);
-	/* Make optional parameter settings here */
-
-
-4. jpeg_start_compress(...);
-
-After you have established the data destination and set all the necessary
-source image info and other parameters, call jpeg_start_compress() to begin
-a compression cycle.  This will initialize internal state, allocate working
-storage, and emit the first few bytes of the JPEG datastream header.
-
-Typical code:
-
-	jpeg_start_compress(&cinfo, TRUE);
-
-The "TRUE" parameter ensures that a complete JPEG interchange datastream
-will be written.  This is appropriate in most cases.  If you think you might
-want to use an abbreviated datastream, read the section on abbreviated
-datastreams, below.
-
-Once you have called jpeg_start_compress(), you may not alter any JPEG
-parameters or other fields of the JPEG object until you have completed
-the compression cycle.
-
-
-5. while (scan lines remain to be written)
-	jpeg_write_scanlines(...);
-
-Now write all the required image data by calling jpeg_write_scanlines()
-one or more times.  You can pass one or more scanlines in each call, up
-to the total image height.  In most applications it is convenient to pass
-just one or a few scanlines at a time.  The expected format for the passed
-data is discussed under "Data formats", above.
-
-Image data should be written in top-to-bottom scanline order.  The JPEG spec
-contains some weasel wording about how top and bottom are application-defined
-terms (a curious interpretation of the English language...) but if you want
-your files to be compatible with everyone else's, you WILL use top-to-bottom
-order.  If the source data must be read in bottom-to-top order, you can use
-the JPEG library's virtual array mechanism to invert the data efficiently.
-Examples of this can be found in the sample application cjpeg.
-
-The library maintains a count of the number of scanlines written so far
-in the next_scanline field of the JPEG object.  Usually you can just use
-this variable as the loop counter, so that the loop test looks like
-"while (cinfo.next_scanline < cinfo.image_height)".
-
-Code for this step depends heavily on the way that you store the source data.
-example.c shows the following code for the case of a full-size 2-D source
-array containing 3-byte RGB pixels:
-
-	JSAMPROW row_pointer[1];	/* pointer to a single row */
-	int row_stride;			/* physical row width in buffer */
-
-	row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
-
-	while (cinfo.next_scanline < cinfo.image_height) {
-	    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
-	    jpeg_write_scanlines(&cinfo, row_pointer, 1);
-	}
-
-jpeg_write_scanlines() returns the number of scanlines actually written.
-This will normally be equal to the number passed in, so you can usually
-ignore the return value.  It is different in just two cases:
-  * If you try to write more scanlines than the declared image height,
-    the additional scanlines are ignored.
-  * If you use a suspending data destination manager, output buffer overrun
-    will cause the compressor to return before accepting all the passed lines.
-    This feature is discussed under "I/O suspension", below.  The normal
-    stdio destination manager will NOT cause this to happen.
-In any case, the return value is the same as the change in the value of
-next_scanline.
-
-
-6. jpeg_finish_compress(...);
-
-After all the image data has been written, call jpeg_finish_compress() to
-complete the compression cycle.  This step is ESSENTIAL to ensure that the
-last bufferload of data is written to the data destination.
-jpeg_finish_compress() also releases working memory associated with the JPEG
-object.
-
-Typical code:
-
-	jpeg_finish_compress(&cinfo);
-
-If using the stdio destination manager, don't forget to close the output
-stdio stream (if necessary) afterwards.
-
-If you have requested a multi-pass operating mode, such as Huffman code
-optimization, jpeg_finish_compress() will perform the additional passes using
-data buffered by the first pass.  In this case jpeg_finish_compress() may take
-quite a while to complete.  With the default compression parameters, this will
-not happen.
-
-It is an error to call jpeg_finish_compress() before writing the necessary
-total number of scanlines.  If you wish to abort compression, call
-jpeg_abort() as discussed below.
-
-After completing a compression cycle, you may dispose of the JPEG object
-as discussed next, or you may use it to compress another image.  In that case
-return to step 2, 3, or 4 as appropriate.  If you do not change the
-destination manager, the new datastream will be written to the same target.
-If you do not change any JPEG parameters, the new datastream will be written
-with the same parameters as before.  Note that you can change the input image
-dimensions freely between cycles, but if you change the input colorspace, you
-should call jpeg_set_defaults() to adjust for the new colorspace; and then
-you'll need to repeat all of step 3.
-
-
-7. Release the JPEG compression object.
-
-When you are done with a JPEG compression object, destroy it by calling
-jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
-the previous state of the object).  Or you can call jpeg_destroy(), which
-works for either compression or decompression objects --- this may be more
-convenient if you are sharing code between compression and decompression
-cases.  (Actually, these routines are equivalent except for the declared type
-of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
-should be passed a j_common_ptr.)
-
-If you allocated the jpeg_compress_struct structure from malloc(), freeing
-it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
-handler structure.
-
-Typical code:
-
-	jpeg_destroy_compress(&cinfo);
-
-
-8. Aborting.
-
-If you decide to abort a compression cycle before finishing, you can clean up
-in either of two ways:
-
-* If you don't need the JPEG object any more, just call
-  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
-  legitimate at any point after calling jpeg_create_compress() --- in fact,
-  it's safe even if jpeg_create_compress() fails.
-
-* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
-  jpeg_abort() which works on both compression and decompression objects.
-  This will return the object to an idle state, releasing any working memory.
-  jpeg_abort() is allowed at any time after successful object creation.
-
-Note that cleaning up the data destination, if required, is your
-responsibility; neither of these routines will call term_destination().
-(See "Compressed data handling", below, for more about that.)
-
-jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
-object that has reported an error by calling error_exit (see "Error handling"
-for more info).  The internal state of such an object is likely to be out of
-whack.  Either of these two routines will return the object to a known state.
-
-
-Decompression details
----------------------
-
-Here we revisit the JPEG decompression outline given in the overview.
-
-1. Allocate and initialize a JPEG decompression object.
-
-This is just like initialization for compression, as discussed above,
-except that the object is a "struct jpeg_decompress_struct" and you
-call jpeg_create_decompress().  Error handling is exactly the same.
-
-Typical code:
-
-	struct jpeg_decompress_struct cinfo;
-	struct jpeg_error_mgr jerr;
-	...
-	cinfo.err = jpeg_std_error(&jerr);
-	jpeg_create_decompress(&cinfo);
-
-(Both here and in the IJG code, we usually use variable name "cinfo" for
-both compression and decompression objects.)
-
-
-2. Specify the source of the compressed data (eg, a file).
-
-As previously mentioned, the JPEG library reads compressed data from a "data
-source" module.  The library includes one data source module which knows how
-to read from a stdio stream.  You can use your own source module if you want
-to do something else, as discussed later.
-
-If you use the standard source module, you must open the source stdio stream
-beforehand.  Typical code for this step looks like:
-
-	FILE * infile;
-	...
-	if ((infile = fopen(filename, "rb")) == NULL) {
-	    fprintf(stderr, "can't open %s\n", filename);
-	    exit(1);
-	}
-	jpeg_stdio_src(&cinfo, infile);
-
-where the last line invokes the standard source module.
-
-WARNING: it is critical that the binary compressed data be read unchanged.
-On non-Unix systems the stdio library may perform newline translation or
-otherwise corrupt binary data.  To suppress this behavior, you may need to use
-a "b" option to fopen (as shown above), or use setmode() or another routine to
-put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
-has been found to work on many systems.
-
-You may not change the data source between calling jpeg_read_header() and
-jpeg_finish_decompress().  If you wish to read a series of JPEG images from
-a single source file, you should repeat the jpeg_read_header() to
-jpeg_finish_decompress() sequence without reinitializing either the JPEG
-object or the data source module; this prevents buffered input data from
-being discarded.
-
-
-3. Call jpeg_read_header() to obtain image info.
-
-Typical code for this step is just
-
-	jpeg_read_header(&cinfo, TRUE);
-
-This will read the source datastream header markers, up to the beginning
-of the compressed data proper.  On return, the image dimensions and other
-info have been stored in the JPEG object.  The application may wish to
-consult this information before selecting decompression parameters.
-
-More complex code is necessary if
-  * A suspending data source is used --- in that case jpeg_read_header()
-    may return before it has read all the header data.  See "I/O suspension",
-    below.  The normal stdio source manager will NOT cause this to happen.
-  * Abbreviated JPEG files are to be processed --- see the section on
-    abbreviated datastreams.  Standard applications that deal only in
-    interchange JPEG files need not be concerned with this case either.
-
-It is permissible to stop at this point if you just wanted to find out the
-image dimensions and other header info for a JPEG file.  In that case,
-call jpeg_destroy() when you are done with the JPEG object, or call
-jpeg_abort() to return it to an idle state before selecting a new data
-source and reading another header.
-
-
-4. Set parameters for decompression.
-
-jpeg_read_header() sets appropriate default decompression parameters based on
-the properties of the image (in particular, its colorspace).  However, you
-may well want to alter these defaults before beginning the decompression.
-For example, the default is to produce full color output from a color file.
-If you want colormapped output you must ask for it.  Other options allow the
-returned image to be scaled and allow various speed/quality tradeoffs to be
-selected.  "Decompression parameter selection", below, gives details.
-
-If the defaults are appropriate, nothing need be done at this step.
-
-Note that all default values are set by each call to jpeg_read_header().
-If you reuse a decompression object, you cannot expect your parameter
-settings to be preserved across cycles, as you can for compression.
-You must set desired parameter values each time.
-
-
-5. jpeg_start_decompress(...);
-
-Once the parameter values are satisfactory, call jpeg_start_decompress() to
-begin decompression.  This will initialize internal state, allocate working
-memory, and prepare for returning data.
-
-Typical code is just
-
-	jpeg_start_decompress(&cinfo);
-
-If you have requested a multi-pass operating mode, such as 2-pass color
-quantization, jpeg_start_decompress() will do everything needed before data
-output can begin.  In this case jpeg_start_decompress() may take quite a while
-to complete.  With a single-scan (non-progressive) JPEG file and default
-decompression parameters, this will not happen; jpeg_start_decompress() will
-return quickly.
-
-After this call, the final output image dimensions, including any requested
-scaling, are available in the JPEG object; so is the selected colormap, if
-colormapped output has been requested.  Useful fields include
-
-	output_width		image width and height, as scaled
-	output_height
-	out_color_components	# of color components in out_color_space
-	output_components	# of color components returned per pixel
-	colormap		the selected colormap, if any
-	actual_number_of_colors		number of entries in colormap
-
-output_components is 1 (a colormap index) when quantizing colors; otherwise it
-equals out_color_components.  It is the number of JSAMPLE values that will be
-emitted per pixel in the output arrays.
-
-Typically you will need to allocate data buffers to hold the incoming image.
-You will need output_width * output_components JSAMPLEs per scanline in your
-output buffer, and a total of output_height scanlines will be returned.
-
-Note: if you are using the JPEG library's internal memory manager to allocate
-data buffers (as djpeg does), then the manager's protocol requires that you
-request large buffers *before* calling jpeg_start_decompress().  This is a
-little tricky since the output_XXX fields are not normally valid then.  You
-can make them valid by calling jpeg_calc_output_dimensions() after setting the
-relevant parameters (scaling, output color space, and quantization flag).
-
-
-6. while (scan lines remain to be read)
-	jpeg_read_scanlines(...);
-
-Now you can read the decompressed image data by calling jpeg_read_scanlines()
-one or more times.  At each call, you pass in the maximum number of scanlines
-to be read (ie, the height of your working buffer); jpeg_read_scanlines()
-will return up to that many lines.  The return value is the number of lines
-actually read.  The format of the returned data is discussed under "Data
-formats", above.  Don't forget that grayscale and color JPEGs will return
-different data formats!
-
-Image data is returned in top-to-bottom scanline order.  If you must write
-out the image in bottom-to-top order, you can use the JPEG library's virtual
-array mechanism to invert the data efficiently.  Examples of this can be
-found in the sample application djpeg.
-
-The library maintains a count of the number of scanlines returned so far
-in the output_scanline field of the JPEG object.  Usually you can just use
-this variable as the loop counter, so that the loop test looks like
-"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
-should NOT be against image_height, unless you never use scaling.  The
-image_height field is the height of the original unscaled image.)
-The return value always equals the change in the value of output_scanline.
-
-If you don't use a suspending data source, it is safe to assume that
-jpeg_read_scanlines() reads at least one scanline per call, until the
-bottom of the image has been reached.
-
-If you use a buffer larger than one scanline, it is NOT safe to assume that
-jpeg_read_scanlines() fills it.  (The current implementation returns only a
-few scanlines per call, no matter how large a buffer you pass.)  So you must
-always provide a loop that calls jpeg_read_scanlines() repeatedly until the
-whole image has been read.
-
-
-7. jpeg_finish_decompress(...);
-
-After all the image data has been read, call jpeg_finish_decompress() to
-complete the decompression cycle.  This causes working memory associated
-with the JPEG object to be released.
-
-Typical code:
-
-	jpeg_finish_decompress(&cinfo);
-
-If using the stdio source manager, don't forget to close the source stdio
-stream if necessary.
-
-It is an error to call jpeg_finish_decompress() before reading the correct
-total number of scanlines.  If you wish to abort decompression, call
-jpeg_abort() as discussed below.
-
-After completing a decompression cycle, you may dispose of the JPEG object as
-discussed next, or you may use it to decompress another image.  In that case
-return to step 2 or 3 as appropriate.  If you do not change the source
-manager, the next image will be read from the same source.
-
-
-8. Release the JPEG decompression object.
-
-When you are done with a JPEG decompression object, destroy it by calling
-jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
-destroying compression objects applies here too.
-
-Typical code:
-
-	jpeg_destroy_decompress(&cinfo);
-
-
-9. Aborting.
-
-You can abort a decompression cycle by calling jpeg_destroy_decompress() or
-jpeg_destroy() if you don't need the JPEG object any more, or
-jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
-The previous discussion of aborting compression cycles applies here too.
-
-
-Mechanics of usage: include files, linking, etc
------------------------------------------------
-
-Applications using the JPEG library should include the header file jpeglib.h
-to obtain declarations of data types and routines.  Before including
-jpeglib.h, include system headers that define at least the typedefs FILE and
-size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
-older Unix systems, you may need <sys/types.h> to define size_t.
-
-If the application needs to refer to individual JPEG library error codes, also
-include jerror.h to define those symbols.
-
-jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
-installing the JPEG header files in a system directory, you will want to
-install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
-
-The most convenient way to include the JPEG code into your executable program
-is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
-machines) and reference it at your link step.  If you use only half of the
-library (only compression or only decompression), only that much code will be
-included from the library, unless your linker is hopelessly brain-damaged.
-The supplied makefiles build libjpeg.a automatically (see install.doc).
-
-While you can build the JPEG library as a shared library if the whim strikes
-you, we don't really recommend it.  The trouble with shared libraries is that
-at some point you'll probably try to substitute a new version of the library
-without recompiling the calling applications.  That generally doesn't work
-because the parameter struct declarations usually change with each new
-version.  In other words, the library's API is *not* guaranteed binary
-compatible across versions; we only try to ensure source-code compatibility.
-(In hindsight, it might have been smarter to hide the parameter structs from
-applications and introduce a ton of access functions instead.  Too late now,
-however.)
-
-On some systems your application may need to set up a signal handler to ensure
-that temporary files are deleted if the program is interrupted.  This is most
-critical if you are on MS-DOS and use the jmemdos.c memory manager back end;
-it will try to grab extended memory for temp files, and that space will NOT be
-freed automatically.  See cjpeg.c or djpeg.c for an example signal handler.
-
-It may be worth pointing out that the core JPEG library does not actually
-require the stdio library: only the default source/destination managers and
-error handler need it.  You can use the library in a stdio-less environment
-if you replace those modules and use jmemnobs.c (or another memory manager of
-your own devising).  More info about the minimum system library requirements
-may be found in jinclude.h.
-
-
-ADVANCED FEATURES
-=================
-
-Compression parameter selection
--------------------------------
-
-This section describes all the optional parameters you can set for JPEG
-compression, as well as the "helper" routines provided to assist in this
-task.  Proper setting of some parameters requires detailed understanding
-of the JPEG standard; if you don't know what a parameter is for, it's best
-not to mess with it!  See REFERENCES in the README file for pointers to
-more info about JPEG.
-
-It's a good idea to call jpeg_set_defaults() first, even if you plan to set
-all the parameters; that way your code is more likely to work with future JPEG
-libraries that have additional parameters.  For the same reason, we recommend
-you use a helper routine where one is provided, in preference to twiddling
-cinfo fields directly.
-
-The helper routines are:
-
-jpeg_set_defaults (j_compress_ptr cinfo)
-	This routine sets all JPEG parameters to reasonable defaults, using
-	only the input image's color space (field in_color_space, which must
-	already be set in cinfo).  Many applications will only need to use
-	this routine and perhaps jpeg_set_quality().
-
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
-	Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
-	and sets other color-space-dependent parameters appropriately.  See
-	"Special color spaces", below, before using this.  A large number of
-	parameters, including all per-component parameters, are set by this
-	routine; if you want to twiddle individual parameters you should call
-	jpeg_set_colorspace() first and make your changes afterward.
-
-jpeg_default_colorspace (j_compress_ptr cinfo)
-	Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
-	and calls jpeg_set_colorspace().  This is actually a subroutine of
-	jpeg_set_defaults().  It's broken out in case you want to change
-	just the colorspace-dependent JPEG parameters.
-
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
-	Constructs JPEG quantization tables appropriate for the indicated
-	quality setting.  The quality value is expressed on the 0..100 scale
-	recommended by IJG (cjpeg's "-quality" switch uses this routine).
-	Note that the exact mapping from quality values to tables may change
-	in future IJG releases as more is learned about DCT quantization.
-	If the force_baseline parameter is TRUE, then the quantization table
-	entries are constrained to the range 1..255 for full JPEG baseline
-	compatibility.  In the current implementation, this only makes a
-	difference for quality settings below 25, and it effectively prevents
-	very small/low quality files from being generated.  The IJG decoder
-	is capable of reading the non-baseline files generated at low quality
-	settings when force_baseline is FALSE, but other decoders may not be.
-
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-			 boolean force_baseline)
-	Same as jpeg_set_quality() except that the generated tables are the
-	sample tables given in the JPEG spec section K.1, multiplied by the
-	specified scale factor (which is expressed as a percentage; thus
-	scale_factor = 100 reproduces the spec's tables).  Note that larger
-	scale factors give lower quality.  This entry point is useful for
-	conforming to the Adobe PostScript DCT conventions, but we do not
-	recommend linear scaling as a user-visible quality scale otherwise.
-	force_baseline again constrains the computed table entries to 1..255.
-
-int jpeg_quality_scaling (int quality)
-	Converts a value on the IJG-recommended quality scale to a linear
-	scaling percentage.  Note that this routine may change or go away
-	in future releases --- IJG may choose to adopt a scaling method that
-	can't be expressed as a simple scalar multiplier, in which case the
-	premise of this routine collapses.  Caveat user.
-
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-		      const unsigned int *basic_table,
-		      int scale_factor, boolean force_baseline)
-	Allows an arbitrary quantization table to be created.  which_tbl
-	indicates which table slot to fill.  basic_table points to an array
-	of 64 unsigned ints given in normal array order.  These values are
-	multiplied by scale_factor/100 and then clamped to the range 1..65535
-	(or to 1..255 if force_baseline is TRUE).
-	CAUTION: prior to library version 6a, jpeg_add_quant_table expected
-	the basic table to be given in JPEG zigzag order.  If you need to
-	write code that works with either older or newer versions of this
-	routine, you must check the library version number.  Something like
-	"#if JPEG_LIB_VERSION >= 61" is the right test.
-
-jpeg_simple_progression (j_compress_ptr cinfo)
-	Generates a default scan script for writing a progressive-JPEG file.
-	This is the recommended method of creating a progressive file,
-	unless you want to make a custom scan sequence.  You must ensure that
-	the JPEG color space is set correctly before calling this routine.
-
-
-Compression parameters (cinfo fields) include:
-
-J_DCT_METHOD dct_method
-	Selects the algorithm used for the DCT step.  Choices are:
-		JDCT_ISLOW: slow but accurate integer algorithm
-		JDCT_IFAST: faster, less accurate integer method
-		JDCT_FLOAT: floating-point method
-		JDCT_DEFAULT: default method (normally JDCT_ISLOW)
-		JDCT_FASTEST: fastest method (normally JDCT_IFAST)
-	The FLOAT method is very slightly more accurate than the ISLOW method,
-	but may give different results on different machines due to varying
-	roundoff behavior.  The integer methods should give the same results
-	on all machines.  On machines with sufficiently fast FP hardware, the
-	floating-point method may also be the fastest.  The IFAST method is
-	considerably less accurate than the other two; its use is not
-	recommended if high quality is a concern.  JDCT_DEFAULT and
-	JDCT_FASTEST are macros configurable by each installation.
-
-J_COLOR_SPACE jpeg_color_space
-int num_components
-	The JPEG color space and corresponding number of components; see
-	"Special color spaces", below, for more info.  We recommend using
-	jpeg_set_colorspace() if you want to change these.
-
-boolean optimize_coding
-	TRUE causes the compressor to compute optimal Huffman coding tables
-	for the image.  This requires an extra pass over the data and
-	therefore costs a good deal of space and time.  The default is
-	FALSE, which tells the compressor to use the supplied or default
-	Huffman tables.  In most cases optimal tables save only a few percent
-	of file size compared to the default tables.  Note that when this is
-	TRUE, you need not supply Huffman tables at all, and any you do
-	supply will be overwritten.
-
-unsigned int restart_interval
-int restart_in_rows
-	To emit restart markers in the JPEG file, set one of these nonzero.
-	Set restart_interval to specify the exact interval in MCU blocks.
-	Set restart_in_rows to specify the interval in MCU rows.  (If
-	restart_in_rows is not 0, then restart_interval is set after the
-	image width in MCUs is computed.)  Defaults are zero (no restarts).
-	One restart marker per MCU row is often a good choice.
-	NOTE: the overhead of restart markers is higher in grayscale JPEG
-	files than in color files, and MUCH higher in progressive JPEGs.
-	If you use restarts, you may want to use larger intervals in those
-	cases.
-
-const jpeg_scan_info * scan_info
-int num_scans
-	By default, scan_info is NULL; this causes the compressor to write a
-	single-scan sequential JPEG file.  If not NULL, scan_info points to
-	an array of scan definition records of length num_scans.  The
-	compressor will then write a JPEG file having one scan for each scan
-	definition record.  This is used to generate noninterleaved or
-	progressive JPEG files.  The library checks that the scan array
-	defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
-	a suitable scan definition array for progressive JPEG.)  This is
-	discussed further under "Progressive JPEG support".
-
-int smoothing_factor
-	If non-zero, the input image is smoothed; the value should be 1 for
-	minimal smoothing to 100 for maximum smoothing.  Consult jcsample.c
-	for details of the smoothing algorithm.  The default is zero.
-
-boolean write_JFIF_header
-	If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
-	jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
-	(ie, YCbCr or grayscale) is selected, otherwise FALSE.
-
-UINT8 JFIF_major_version
-UINT8 JFIF_minor_version
-	The version number to be written into the JFIF marker.
-	jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
-	You should set it to 1.02 (major=1, minor=2) if you plan to write
-	any JFIF 1.02 extension markers.
-
-UINT8 density_unit
-UINT16 X_density
-UINT16 Y_density
-	The resolution information to be written into the JFIF marker;
-	not used otherwise.  density_unit may be 0 for unknown,
-	1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
-	indicating square pixels of unknown size.
-
-boolean write_Adobe_marker
-	If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
-	jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
-	or YCCK is selected, otherwise FALSE.  It is generally a bad idea
-	to set both write_JFIF_header and write_Adobe_marker.  In fact,
-	you probably shouldn't change the default settings at all --- the
-	default behavior ensures that the JPEG file's color space can be
-	recognized by the decoder.
-
-JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]
-	Pointers to coefficient quantization tables, one per table slot,
-	or NULL if no table is defined for a slot.  Usually these should
-	be set via one of the above helper routines; jpeg_add_quant_table()
-	is general enough to define any quantization table.  The other
-	routines will set up table slot 0 for luminance quality and table
-	slot 1 for chrominance.
-
-JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
-JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
-	Pointers to Huffman coding tables, one per table slot, or NULL if
-	no table is defined for a slot.  Slots 0 and 1 are filled with the
-	JPEG sample tables by jpeg_set_defaults().  If you need to allocate
-	more table structures, jpeg_alloc_huff_table() may be used.
-	Note that optimal Huffman tables can be computed for an image
-	by setting optimize_coding, as discussed above; there's seldom
-	any need to mess with providing your own Huffman tables.
-
-There are some additional cinfo fields which are not documented here
-because you currently can't change them; for example, you can't set
-arith_code TRUE because arithmetic coding is unsupported.
-
-
-Per-component parameters are stored in the struct cinfo.comp_info[i] for
-component number i.  Note that components here refer to components of the
-JPEG color space, *not* the source image color space.  A suitably large
-comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
-to use that routine, it's up to you to allocate the array.
-
-int component_id
-	The one-byte identifier code to be recorded in the JPEG file for
-	this component.  For the standard color spaces, we recommend you
-	leave the default values alone.
-
-int h_samp_factor
-int v_samp_factor
-	Horizontal and vertical sampling factors for the component; must
-	be 1..4 according to the JPEG standard.  Note that larger sampling
-	factors indicate a higher-resolution component; many people find
-	this behavior quite unintuitive.  The default values are 2,2 for
-	luminance components and 1,1 for chrominance components, except
-	for grayscale where 1,1 is used.
-
-int quant_tbl_no
-	Quantization table number for component.  The default value is
-	0 for luminance components and 1 for chrominance components.
-
-int dc_tbl_no
-int ac_tbl_no
-	DC and AC entropy coding table numbers.  The default values are
-	0 for luminance components and 1 for chrominance components.
-
-int component_index
-	Must equal the component's index in comp_info[].  (Beginning in
-	release v6, the compressor library will fill this in automatically;
-	you don't have to.)
-
-
-Decompression parameter selection
----------------------------------
-
-Decompression parameter selection is somewhat simpler than compression
-parameter selection, since all of the JPEG internal parameters are
-recorded in the source file and need not be supplied by the application.
-(Unless you are working with abbreviated files, in which case see
-"Abbreviated datastreams", below.)  Decompression parameters control
-the postprocessing done on the image to deliver it in a format suitable
-for the application's use.  Many of the parameters control speed/quality
-tradeoffs, in which faster decompression may be obtained at the price of
-a poorer-quality image.  The defaults select the highest quality (slowest)
-processing.
-
-The following fields in the JPEG object are set by jpeg_read_header() and
-may be useful to the application in choosing decompression parameters:
-
-JDIMENSION image_width			Width and height of image
-JDIMENSION image_height
-int num_components			Number of color components
-J_COLOR_SPACE jpeg_color_space		Colorspace of image
-boolean saw_JFIF_marker			TRUE if a JFIF APP0 marker was seen
-  UINT8 JFIF_major_version		Version information from JFIF marker
-  UINT8 JFIF_minor_version
-  UINT8 density_unit			Resolution data from JFIF marker
-  UINT16 X_density
-  UINT16 Y_density
-boolean saw_Adobe_marker		TRUE if an Adobe APP14 marker was seen
-  UINT8 Adobe_transform			Color transform code from Adobe marker
-
-The JPEG color space, unfortunately, is something of a guess since the JPEG
-standard proper does not provide a way to record it.  In practice most files
-adhere to the JFIF or Adobe conventions, and the decoder will recognize these
-correctly.  See "Special color spaces", below, for more info.
-
-
-The decompression parameters that determine the basic properties of the
-returned image are:
-
-J_COLOR_SPACE out_color_space
-	Output color space.  jpeg_read_header() sets an appropriate default
-	based on jpeg_color_space; typically it will be RGB or grayscale.
-	The application can change this field to request output in a different
-	colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
-	output from a color file.  (This is useful for previewing: grayscale
-	output is faster than full color since the color components need not
-	be processed.)  Note that not all possible color space transforms are
-	currently implemented; you may need to extend jdcolor.c if you want an
-	unusual conversion.
-
-unsigned int scale_num, scale_denom
-	Scale the image by the fraction scale_num/scale_denom.  Default is
-	1/1, or no scaling.  Currently, the only supported scaling ratios
-	are 1/1, 1/2, 1/4, and 1/8.  (The library design allows for arbitrary
-	scaling ratios but this is not likely to be implemented any time soon.)
-	Smaller scaling ratios permit significantly faster decoding since
-	fewer pixels need be processed and a simpler IDCT method can be used.
-
-boolean quantize_colors
-	If set TRUE, colormapped output will be delivered.  Default is FALSE,
-	meaning that full-color output will be delivered.
-
-The next three parameters are relevant only if quantize_colors is TRUE.
-
-int desired_number_of_colors
-	Maximum number of colors to use in generating a library-supplied color
-	map (the actual number of colors is returned in a different field).
-	Default 256.  Ignored when the application supplies its own color map.
-
-boolean two_pass_quantize
-	If TRUE, an extra pass over the image is made to select a custom color
-	map for the image.  This usually looks a lot better than the one-size-
-	fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
-	when the application supplies its own color map.
-
-J_DITHER_MODE dither_mode
-	Selects color dithering method.  Supported values are:
-		JDITHER_NONE	no dithering: fast, very low quality
-		JDITHER_ORDERED	ordered dither: moderate speed and quality
-		JDITHER_FS	Floyd-Steinberg dither: slow, high quality
-	Default is JDITHER_FS.  (At present, ordered dither is implemented
-	only in the single-pass, standard-colormap case.  If you ask for
-	ordered dither when two_pass_quantize is TRUE or when you supply
-	an external color map, you'll get F-S dithering.)
-
-When quantize_colors is TRUE, the target color map is described by the next
-two fields.  colormap is set to NULL by jpeg_read_header().  The application
-can supply a color map by setting colormap non-NULL and setting
-actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
-selects a suitable color map and sets these two fields itself.
-[Implementation restriction: at present, an externally supplied colormap is
-only accepted for 3-component output color spaces.]
-
-JSAMPARRAY colormap
-	The color map, represented as a 2-D pixel array of out_color_components
-	rows and actual_number_of_colors columns.  Ignored if not quantizing.
-	CAUTION: if the JPEG library creates its own colormap, the storage
-	pointed to by this field is released by jpeg_finish_decompress().
-	Copy the colormap somewhere else first, if you want to save it.
-
-int actual_number_of_colors
-	The number of colors in the color map.
-
-Additional decompression parameters that the application may set include:
-
-J_DCT_METHOD dct_method
-	Selects the algorithm used for the DCT step.  Choices are the same
-	as described above for compression.
-
-boolean do_fancy_upsampling
-	If TRUE, do careful upsampling of chroma components.  If FALSE,
-	a faster but sloppier method is used.  Default is TRUE.  The visual
-	impact of the sloppier method is often very small.
-
-boolean do_block_smoothing
-	If TRUE, interblock smoothing is applied in early stages of decoding
-	progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
-	progression stages look "fuzzy" with smoothing, "blocky" without.
-	In any case, block smoothing ceases to be applied after the first few
-	AC coefficients are known to full accuracy, so it is relevant only
-	when using buffered-image mode for progressive images.
-
-boolean enable_1pass_quant
-boolean enable_external_quant
-boolean enable_2pass_quant
-	These are significant only in buffered-image mode, which is
-	described in its own section below.
-
-
-The output image dimensions are given by the following fields.  These are
-computed from the source image dimensions and the decompression parameters
-by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
-to obtain the values that will result from the current parameter settings.
-This can be useful if you are trying to pick a scaling ratio that will get
-close to a desired target size.  It's also important if you are using the
-JPEG library's memory manager to allocate output buffer space, because you
-are supposed to request such buffers *before* jpeg_start_decompress().
-
-JDIMENSION output_width		Actual dimensions of output image.
-JDIMENSION output_height
-int out_color_components	Number of color components in out_color_space.
-int output_components		Number of color components returned.
-int rec_outbuf_height		Recommended height of scanline buffer.
-
-When quantizing colors, output_components is 1, indicating a single color map
-index per pixel.  Otherwise it equals out_color_components.  The output arrays
-are required to be output_width * output_components JSAMPLEs wide.
-
-rec_outbuf_height is the recommended minimum height (in scanlines) of the
-buffer passed to jpeg_read_scanlines().  If the buffer is smaller, the
-library will still work, but time will be wasted due to unnecessary data
-copying.  In high-quality modes, rec_outbuf_height is always 1, but some
-faster, lower-quality modes set it to larger values (typically 2 to 4).
-If you are going to ask for a high-speed processing mode, you may as well
-go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
-(An output buffer larger than rec_outbuf_height lines is OK, but won't
-provide any material speed improvement over that height.)
-
-
-Special color spaces
---------------------
-
-The JPEG standard itself is "color blind" and doesn't specify any particular
-color space.  It is customary to convert color data to a luminance/chrominance
-color space before compressing, since this permits greater compression.  The
-existing de-facto JPEG file format standards specify YCbCr or grayscale data
-(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
-applications such as multispectral images, other color spaces can be used,
-but it must be understood that such files will be unportable.
-
-The JPEG library can handle the most common colorspace conversions (namely
-RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
-color space, passing it through without conversion.  If you deal extensively
-with an unusual color space, you can easily extend the library to understand
-additional color spaces and perform appropriate conversions.
-
-For compression, the source data's color space is specified by field
-in_color_space.  This is transformed to the JPEG file's color space given
-by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
-space depending on in_color_space, but you can override this by calling
-jpeg_set_colorspace().  Of course you must select a supported transformation.
-jccolor.c currently supports the following transformations:
-	RGB => YCbCr
-	RGB => GRAYSCALE
-	YCbCr => GRAYSCALE
-	CMYK => YCCK
-plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
-YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
-
-The de-facto file format standards (JFIF and Adobe) specify APPn markers that
-indicate the color space of the JPEG file.  It is important to ensure that
-these are written correctly, or omitted if the JPEG file's color space is not
-one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
-will set the compression parameters to include or omit the APPn markers
-properly, so long as it is told the truth about the JPEG color space.
-For example, if you are writing some random 3-component color space without
-conversion, don't try to fake out the library by setting in_color_space and
-jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
-APPn marker of your own devising to identify the colorspace --- see "Special
-markers", below.
-
-When told that the color space is UNKNOWN, the library will default to using
-luminance-quality compression parameters for all color components.  You may
-well want to change these parameters.  See the source code for
-jpeg_set_colorspace(), in jcparam.c, for details.
-
-For decompression, the JPEG file's color space is given in jpeg_color_space,
-and this is transformed to the output color space out_color_space.
-jpeg_read_header's setting of jpeg_color_space can be relied on if the file
-conforms to JFIF or Adobe conventions, but otherwise it is no better than a
-guess.  If you know the JPEG file's color space for certain, you can override
-jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
-selects a default output color space based on (its guess of) jpeg_color_space;
-set out_color_space to override this.  Again, you must select a supported
-transformation.  jdcolor.c currently supports
-	YCbCr => GRAYSCALE
-	YCbCr => RGB
-	GRAYSCALE => RGB
-	YCCK => CMYK
-as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
-application can force grayscale JPEGs to look like color JPEGs if it only
-wants to handle one case.)
-
-The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
-(it weights distances appropriately for RGB colors).  You'll need to modify
-the code if you want to use it for non-RGB output color spaces.  Note that
-jquant2.c is used to map to an application-supplied colormap as well as for
-the normal two-pass colormap selection process.
-
-CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
-files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
-This is arguably a bug in Photoshop, but if you need to work with Photoshop
-CMYK files, you will have to deal with it in your application.  We cannot
-"fix" this in the library by inverting the data during the CMYK<=>YCCK
-transform, because that would break other applications, notably Ghostscript.
-Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
-data in the same inverted-YCCK representation used in bare JPEG files, but
-the surrounding PostScript code performs an inversion using the PS image
-operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
-EPS/JPEG files, and will omit the PS-level inversion.  (But the data
-polarity used in bare JPEG files will not change in 3.0.)  In either case,
-the JPEG library must not invert the data itself, or else Ghostscript would
-read these EPS files incorrectly.
-
-
-Error handling
---------------
-
-When the default error handler is used, any error detected inside the JPEG
-routines will cause a message to be printed on stderr, followed by exit().
-You can supply your own error handling routines to override this behavior
-and to control the treatment of nonfatal warnings and trace/debug messages.
-The file example.c illustrates the most common case, which is to have the
-application regain control after an error rather than exiting.
-
-The JPEG library never writes any message directly; it always goes through
-the error handling routines.  Three classes of messages are recognized:
-  * Fatal errors: the library cannot continue.
-  * Warnings: the library can continue, but the data is corrupt, and a
-    damaged output image is likely to result.
-  * Trace/informational messages.  These come with a trace level indicating
-    the importance of the message; you can control the verbosity of the
-    program by adjusting the maximum trace level that will be displayed.
-
-You may, if you wish, simply replace the entire JPEG error handling module
-(jerror.c) with your own code.  However, you can avoid code duplication by
-only replacing some of the routines depending on the behavior you need.
-This is accomplished by calling jpeg_std_error() as usual, but then overriding
-some of the method pointers in the jpeg_error_mgr struct, as illustrated by
-example.c.
-
-All of the error handling routines will receive a pointer to the JPEG object
-(a j_common_ptr which points to either a jpeg_compress_struct or a
-jpeg_decompress_struct; if you need to tell which, test the is_decompressor
-field).  This struct includes a pointer to the error manager struct in its
-"err" field.  Frequently, custom error handler routines will need to access
-additional data which is not known to the JPEG library or the standard error
-handler.  The most convenient way to do this is to embed either the JPEG
-object or the jpeg_error_mgr struct in a larger structure that contains
-additional fields; then casting the passed pointer provides access to the
-additional fields.  Again, see example.c for one way to do it.  (Beginning
-with IJG version 6b, there is also a void pointer "client_data" in each
-JPEG object, which the application can also use to find related data.
-The library does not touch client_data at all.)
-
-The individual methods that you might wish to override are:
-
-error_exit (j_common_ptr cinfo)
-	Receives control for a fatal error.  Information sufficient to
-	generate the error message has been stored in cinfo->err; call
-	output_message to display it.  Control must NOT return to the caller;
-	generally this routine will exit() or longjmp() somewhere.
-	Typically you would override this routine to get rid of the exit()
-	default behavior.  Note that if you continue processing, you should
-	clean up the JPEG object with jpeg_abort() or jpeg_destroy().
-
-output_message (j_common_ptr cinfo)
-	Actual output of any JPEG message.  Override this to send messages
-	somewhere other than stderr.  Note that this method does not know
-	how to generate a message, only where to send it.
-
-format_message (j_common_ptr cinfo, char * buffer)
-	Constructs a readable error message string based on the error info
-	stored in cinfo->err.  This method is called by output_message.  Few
-	applications should need to override this method.  One possible
-	reason for doing so is to implement dynamic switching of error message
-	language.
-
-emit_message (j_common_ptr cinfo, int msg_level)
-	Decide whether or not to emit a warning or trace message; if so,
-	calls output_message.  The main reason for overriding this method
-	would be to abort on warnings.  msg_level is -1 for warnings,
-	0 and up for trace messages.
-
-Only error_exit() and emit_message() are called from the rest of the JPEG
-library; the other two are internal to the error handler.
-
-The actual message texts are stored in an array of strings which is pointed to
-by the field err->jpeg_message_table.  The messages are numbered from 0 to
-err->last_jpeg_message, and it is these code numbers that are used in the
-JPEG library code.  You could replace the message texts (for instance, with
-messages in French or German) by changing the message table pointer.  See
-jerror.h for the default texts.  CAUTION: this table will almost certainly
-change or grow from one library version to the next.
-
-It may be useful for an application to add its own message texts that are
-handled by the same mechanism.  The error handler supports a second "add-on"
-message table for this purpose.  To define an addon table, set the pointer
-err->addon_message_table and the message numbers err->first_addon_message and
-err->last_addon_message.  If you number the addon messages beginning at 1000
-or so, you won't have to worry about conflicts with the library's built-in
-messages.  See the sample applications cjpeg/djpeg for an example of using
-addon messages (the addon messages are defined in cderror.h).
-
-Actual invocation of the error handler is done via macros defined in jerror.h:
-	ERREXITn(...)	for fatal errors
-	WARNMSn(...)	for corrupt-data warnings
-	TRACEMSn(...)	for trace and informational messages.
-These macros store the message code and any additional parameters into the
-error handler struct, then invoke the error_exit() or emit_message() method.
-The variants of each macro are for varying numbers of additional parameters.
-The additional parameters are inserted into the generated message using
-standard printf() format codes.
-
-See jerror.h and jerror.c for further details.
-
-
-Compressed data handling (source and destination managers)
-----------------------------------------------------------
-
-The JPEG compression library sends its compressed data to a "destination
-manager" module.  The default destination manager just writes the data to a
-stdio stream, but you can provide your own manager to do something else.
-Similarly, the decompression library calls a "source manager" to obtain the
-compressed data; you can provide your own source manager if you want the data
-to come from somewhere other than a stdio stream.
-
-In both cases, compressed data is processed a bufferload at a time: the
-destination or source manager provides a work buffer, and the library invokes
-the manager only when the buffer is filled or emptied.  (You could define a
-one-character buffer to force the manager to be invoked for each byte, but
-that would be rather inefficient.)  The buffer's size and location are
-controlled by the manager, not by the library.  For example, if you desired to
-decompress a JPEG datastream that was all in memory, you could just make the
-buffer pointer and length point to the original data in memory.  Then the
-buffer-reload procedure would be invoked only if the decompressor ran off the
-end of the datastream, which would indicate an erroneous datastream.
-
-The work buffer is defined as an array of datatype JOCTET, which is generally
-"char" or "unsigned char".  On a machine where char is not exactly 8 bits
-wide, you must define JOCTET as a wider data type and then modify the data
-source and destination modules to transcribe the work arrays into 8-bit units
-on external storage.
-
-A data destination manager struct contains a pointer and count defining the
-next byte to write in the work buffer and the remaining free space:
-
-	JOCTET * next_output_byte;  /* => next byte to write in buffer */
-	size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
-
-The library increments the pointer and decrements the count until the buffer
-is filled.  The manager's empty_output_buffer method must reset the pointer
-and count.  The manager is expected to remember the buffer's starting address
-and total size in private fields not visible to the library.
-
-A data destination manager provides three methods:
-
-init_destination (j_compress_ptr cinfo)
-	Initialize destination.  This is called by jpeg_start_compress()
-	before any data is actually written.  It must initialize
-	next_output_byte and free_in_buffer.  free_in_buffer must be
-	initialized to a positive value.
-
-empty_output_buffer (j_compress_ptr cinfo)
-	This is called whenever the buffer has filled (free_in_buffer
-	reaches zero).  In typical applications, it should write out the
-	*entire* buffer (use the saved start address and buffer length;
-	ignore the current state of next_output_byte and free_in_buffer).
-	Then reset the pointer & count to the start of the buffer, and
-	return TRUE indicating that the buffer has been dumped.
-	free_in_buffer must be set to a positive value when TRUE is
-	returned.  A FALSE return should only be used when I/O suspension is
-	desired (this operating mode is discussed in the next section).
-
-term_destination (j_compress_ptr cinfo)
-	Terminate destination --- called by jpeg_finish_compress() after all
-	data has been written.  In most applications, this must flush any
-	data remaining in the buffer.  Use either next_output_byte or
-	free_in_buffer to determine how much data is in the buffer.
-
-term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
-want the destination manager to be cleaned up during an abort, you must do it
-yourself.
-
-You will also need code to create a jpeg_destination_mgr struct, fill in its
-method pointers, and insert a pointer to the struct into the "dest" field of
-the JPEG compression object.  This can be done in-line in your setup code if
-you like, but it's probably cleaner to provide a separate routine similar to
-the jpeg_stdio_dest() routine of the supplied destination manager.
-
-Decompression source managers follow a parallel design, but with some
-additional frammishes.  The source manager struct contains a pointer and count
-defining the next byte to read from the work buffer and the number of bytes
-remaining:
-
-	const JOCTET * next_input_byte; /* => next byte to read from buffer */
-	size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
-
-The library increments the pointer and decrements the count until the buffer
-is emptied.  The manager's fill_input_buffer method must reset the pointer and
-count.  In most applications, the manager must remember the buffer's starting
-address and total size in private fields not visible to the library.
-
-A data source manager provides five methods:
-
-init_source (j_decompress_ptr cinfo)
-	Initialize source.  This is called by jpeg_read_header() before any
-	data is actually read.  Unlike init_destination(), it may leave
-	bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
-	will occur immediately).
-
-fill_input_buffer (j_decompress_ptr cinfo)
-	This is called whenever bytes_in_buffer has reached zero and more
-	data is wanted.  In typical applications, it should read fresh data
-	into the buffer (ignoring the current state of next_input_byte and
-	bytes_in_buffer), reset the pointer & count to the start of the
-	buffer, and return TRUE indicating that the buffer has been reloaded.
-	It is not necessary to fill the buffer entirely, only to obtain at
-	least one more byte.  bytes_in_buffer MUST be set to a positive value
-	if TRUE is returned.  A FALSE return should only be used when I/O
-	suspension is desired (this mode is discussed in the next section).
-
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
-	Skip num_bytes worth of data.  The buffer pointer and count should
-	be advanced over num_bytes input bytes, refilling the buffer as
-	needed.  This is used to skip over a potentially large amount of
-	uninteresting data (such as an APPn marker).  In some applications
-	it may be possible to optimize away the reading of the skipped data,
-	but it's not clear that being smart is worth much trouble; large
-	skips are uncommon.  bytes_in_buffer may be zero on return.
-	A zero or negative skip count should be treated as a no-op.
-
-resync_to_restart (j_decompress_ptr cinfo, int desired)
-	This routine is called only when the decompressor has failed to find
-	a restart (RSTn) marker where one is expected.  Its mission is to
-	find a suitable point for resuming decompression.  For most
-	applications, we recommend that you just use the default resync
-	procedure, jpeg_resync_to_restart().  However, if you are able to back
-	up in the input data stream, or if you have a-priori knowledge about
-	the likely location of restart markers, you may be able to do better.
-	Read the read_restart_marker() and jpeg_resync_to_restart() routines
-	in jdmarker.c if you think you'd like to implement your own resync
-	procedure.
-
-term_source (j_decompress_ptr cinfo)
-	Terminate source --- called by jpeg_finish_decompress() after all
-	data has been read.  Often a no-op.
-
-For both fill_input_buffer() and skip_input_data(), there is no such thing
-as an EOF return.  If the end of the file has been reached, the routine has
-a choice of exiting via ERREXIT() or inserting fake data into the buffer.
-In most cases, generating a warning message and inserting a fake EOI marker
-is the best course of action --- this will allow the decompressor to output
-however much of the image is there.  In pathological cases, the decompressor
-may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
-jdatasrc.c illustrates the recommended error recovery behavior.
-
-term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
-the source manager to be cleaned up during an abort, you must do it yourself.
-
-You will also need code to create a jpeg_source_mgr struct, fill in its method
-pointers, and insert a pointer to the struct into the "src" field of the JPEG
-decompression object.  This can be done in-line in your setup code if you
-like, but it's probably cleaner to provide a separate routine similar to the
-jpeg_stdio_src() routine of the supplied source manager.
-
-For more information, consult the stdio source and destination managers
-in jdatasrc.c and jdatadst.c.
-
-
-I/O suspension
---------------
-
-Some applications need to use the JPEG library as an incremental memory-to-
-memory filter: when the compressed data buffer is filled or emptied, they want
-control to return to the outer loop, rather than expecting that the buffer can
-be emptied or reloaded within the data source/destination manager subroutine.
-The library supports this need by providing an "I/O suspension" mode, which we
-describe in this section.
-
-The I/O suspension mode is not a panacea: nothing is guaranteed about the
-maximum amount of time spent in any one call to the library, so it will not
-eliminate response-time problems in single-threaded applications.  If you
-need guaranteed response time, we suggest you "bite the bullet" and implement
-a real multi-tasking capability.
-
-To use I/O suspension, cooperation is needed between the calling application
-and the data source or destination manager; you will always need a custom
-source/destination manager.  (Please read the previous section if you haven't
-already.)  The basic idea is that the empty_output_buffer() or
-fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
-that it has done nothing.  Upon seeing this, the JPEG library suspends
-operation and returns to its caller.  The surrounding application is
-responsible for emptying or refilling the work buffer before calling the
-JPEG library again.
-
-Compression suspension:
-
-For compression suspension, use an empty_output_buffer() routine that returns
-FALSE; typically it will not do anything else.  This will cause the
-compressor to return to the caller of jpeg_write_scanlines(), with the return
-value indicating that not all the supplied scanlines have been accepted.
-The application must make more room in the output buffer, adjust the output
-buffer pointer/count appropriately, and then call jpeg_write_scanlines()
-again, pointing to the first unconsumed scanline.
-
-When forced to suspend, the compressor will backtrack to a convenient stopping
-point (usually the start of the current MCU); it will regenerate some output
-data when restarted.  Therefore, although empty_output_buffer() is only
-called when the buffer is filled, you should NOT write out the entire buffer
-after a suspension.  Write only the data up to the current position of
-next_output_byte/free_in_buffer.  The data beyond that point will be
-regenerated after resumption.
-
-Because of the backtracking behavior, a good-size output buffer is essential
-for efficiency; you don't want the compressor to suspend often.  (In fact, an
-overly small buffer could lead to infinite looping, if a single MCU required
-more data than would fit in the buffer.)  We recommend a buffer of at least
-several Kbytes.  You may want to insert explicit code to ensure that you don't
-call jpeg_write_scanlines() unless there is a reasonable amount of space in
-the output buffer; in other words, flush the buffer before trying to compress
-more data.
-
-The compressor does not allow suspension while it is trying to write JPEG
-markers at the beginning and end of the file.  This means that:
-  * At the beginning of a compression operation, there must be enough free
-    space in the output buffer to hold the header markers (typically 600 or
-    so bytes).  The recommended buffer size is bigger than this anyway, so
-    this is not a problem as long as you start with an empty buffer.  However,
-    this restriction might catch you if you insert large special markers, such
-    as a JFIF thumbnail image, without flushing the buffer afterwards.
-  * When you call jpeg_finish_compress(), there must be enough space in the
-    output buffer to emit any buffered data and the final EOI marker.  In the
-    current implementation, half a dozen bytes should suffice for this, but
-    for safety's sake we recommend ensuring that at least 100 bytes are free
-    before calling jpeg_finish_compress().
-
-A more significant restriction is that jpeg_finish_compress() cannot suspend.
-This means you cannot use suspension with multi-pass operating modes, namely
-Huffman code optimization and multiple-scan output.  Those modes write the
-whole file during jpeg_finish_compress(), which will certainly result in
-buffer overrun.  (Note that this restriction applies only to compression,
-not decompression.  The decompressor supports input suspension in all of its
-operating modes.)
-
-Decompression suspension:
-
-For decompression suspension, use a fill_input_buffer() routine that simply
-returns FALSE (except perhaps during error recovery, as discussed below).
-This will cause the decompressor to return to its caller with an indication
-that suspension has occurred.  This can happen at four places:
-  * jpeg_read_header(): will return JPEG_SUSPENDED.
-  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
-  * jpeg_read_scanlines(): will return the number of scanlines already
-	completed (possibly 0).
-  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
-The surrounding application must recognize these cases, load more data into
-the input buffer, and repeat the call.  In the case of jpeg_read_scanlines(),
-increment the passed pointers past any scanlines successfully read.
-
-Just as with compression, the decompressor will typically backtrack to a
-convenient restart point before suspending.  When fill_input_buffer() is
-called, next_input_byte/bytes_in_buffer point to the current restart point,
-which is where the decompressor will backtrack to if FALSE is returned.
-The data beyond that position must NOT be discarded if you suspend; it needs
-to be re-read upon resumption.  In most implementations, you'll need to shift
-this data down to the start of your work buffer and then load more data after
-it.  Again, this behavior means that a several-Kbyte work buffer is essential
-for decent performance; furthermore, you should load a reasonable amount of
-new data before resuming decompression.  (If you loaded, say, only one new
-byte each time around, you could waste a LOT of cycles.)
-
-The skip_input_data() source manager routine requires special care in a
-suspension scenario.  This routine is NOT granted the ability to suspend the
-decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
-requested skip distance exceeds the amount of data currently in the input
-buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
-additional skip distance somewhere else.  The decompressor will immediately
-call fill_input_buffer(), which should return FALSE, which will cause a
-suspension return.  The surrounding application must then arrange to discard
-the recorded number of bytes before it resumes loading the input buffer.
-(Yes, this design is rather baroque, but it avoids complexity in the far more
-common case where a non-suspending source manager is used.)
-
-If the input data has been exhausted, we recommend that you emit a warning
-and insert dummy EOI markers just as a non-suspending data source manager
-would do.  This can be handled either in the surrounding application logic or
-within fill_input_buffer(); the latter is probably more efficient.  If
-fill_input_buffer() knows that no more data is available, it can set the
-pointer/count to point to a dummy EOI marker and then return TRUE just as
-though it had read more data in a non-suspending situation.
-
-The decompressor does not attempt to suspend within standard JPEG markers;
-instead it will backtrack to the start of the marker and reprocess the whole
-marker next time.  Hence the input buffer must be large enough to hold the
-longest standard marker in the file.  Standard JPEG markers should normally
-not exceed a few hundred bytes each (DHT tables are typically the longest).
-We recommend at least a 2K buffer for performance reasons, which is much
-larger than any correct marker is likely to be.  For robustness against
-damaged marker length counts, you may wish to insert a test in your
-application for the case that the input buffer is completely full and yet
-the decoder has suspended without consuming any data --- otherwise, if this
-situation did occur, it would lead to an endless loop.  (The library can't
-provide this test since it has no idea whether "the buffer is full", or
-even whether there is a fixed-size input buffer.)
-
-The input buffer would need to be 64K to allow for arbitrary COM or APPn
-markers, but these are handled specially: they are either saved into allocated
-memory, or skipped over by calling skip_input_data().  In the former case,
-suspension is handled correctly, and in the latter case, the problem of
-buffer overrun is placed on skip_input_data's shoulders, as explained above.
-Note that if you provide your own marker handling routine for large markers,
-you should consider how to deal with buffer overflow.
-
-Multiple-buffer management:
-
-In some applications it is desirable to store the compressed data in a linked
-list of buffer areas, so as to avoid data copying.  This can be handled by
-having empty_output_buffer() or fill_input_buffer() set the pointer and count
-to reference the next available buffer; FALSE is returned only if no more
-buffers are available.  Although seemingly straightforward, there is a
-pitfall in this approach: the backtrack that occurs when FALSE is returned
-could back up into an earlier buffer.  For example, when fill_input_buffer()
-is called, the current pointer & count indicate the backtrack restart point.
-Since fill_input_buffer() will set the pointer and count to refer to a new
-buffer, the restart position must be saved somewhere else.  Suppose a second
-call to fill_input_buffer() occurs in the same library call, and no
-additional input data is available, so fill_input_buffer must return FALSE.
-If the JPEG library has not moved the pointer/count forward in the current
-buffer, then *the correct restart point is the saved position in the prior
-buffer*.  Prior buffers may be discarded only after the library establishes
-a restart point within a later buffer.  Similar remarks apply for output into
-a chain of buffers.
-
-The library will never attempt to backtrack over a skip_input_data() call,
-so any skipped data can be permanently discarded.  You still have to deal
-with the case of skipping not-yet-received data, however.
-
-It's much simpler to use only a single buffer; when fill_input_buffer() is
-called, move any unconsumed data (beyond the current pointer/count) down to
-the beginning of this buffer and then load new data into the remaining buffer
-space.  This approach requires a little more data copying but is far easier
-to get right.
-
-
-Progressive JPEG support
-------------------------
-
-Progressive JPEG rearranges the stored data into a series of scans of
-increasing quality.  In situations where a JPEG file is transmitted across a
-slow communications link, a decoder can generate a low-quality image very
-quickly from the first scan, then gradually improve the displayed quality as
-more scans are received.  The final image after all scans are complete is
-identical to that of a regular (sequential) JPEG file of the same quality
-setting.  Progressive JPEG files are often slightly smaller than equivalent
-sequential JPEG files, but the possibility of incremental display is the main
-reason for using progressive JPEG.
-
-The IJG encoder library generates progressive JPEG files when given a
-suitable "scan script" defining how to divide the data into scans.
-Creation of progressive JPEG files is otherwise transparent to the encoder.
-Progressive JPEG files can also be read transparently by the decoder library.
-If the decoding application simply uses the library as defined above, it
-will receive a final decoded image without any indication that the file was
-progressive.  Of course, this approach does not allow incremental display.
-To perform incremental display, an application needs to use the decoder
-library's "buffered-image" mode, in which it receives a decoded image
-multiple times.
-
-Each displayed scan requires about as much work to decode as a full JPEG
-image of the same size, so the decoder must be fairly fast in relation to the
-data transmission rate in order to make incremental display useful.  However,
-it is possible to skip displaying the image and simply add the incoming bits
-to the decoder's coefficient buffer.  This is fast because only Huffman
-decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
-The IJG decoder library allows the application to switch dynamically between
-displaying the image and simply absorbing the incoming bits.  A properly
-coded application can automatically adapt the number of display passes to
-suit the time available as the image is received.  Also, a final
-higher-quality display cycle can be performed from the buffered data after
-the end of the file is reached.
-
-Progressive compression:
-
-To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
-set the scan_info cinfo field to point to an array of scan descriptors, and
-perform compression as usual.  Instead of constructing your own scan list,
-you can call the jpeg_simple_progression() helper routine to create a
-recommended progression sequence; this method should be used by all
-applications that don't want to get involved in the nitty-gritty of
-progressive scan sequence design.  (If you want to provide user control of
-scan sequences, you may wish to borrow the scan script reading code found
-in rdswitch.c, so that you can read scan script files just like cjpeg's.)
-When scan_info is not NULL, the compression library will store DCT'd data
-into a buffer array as jpeg_write_scanlines() is called, and will emit all
-the requested scans during jpeg_finish_compress().  This implies that
-multiple-scan output cannot be created with a suspending data destination
-manager, since jpeg_finish_compress() does not support suspension.  We
-should also note that the compressor currently forces Huffman optimization
-mode when creating a progressive JPEG file, because the default Huffman
-tables are unsuitable for progressive files.
-
-Progressive decompression:
-
-When buffered-image mode is not used, the decoder library will read all of
-a multi-scan file during jpeg_start_decompress(), so that it can provide a
-final decoded image.  (Here "multi-scan" means either progressive or
-multi-scan sequential.)  This makes multi-scan files transparent to the
-decoding application.  However, existing applications that used suspending
-input with version 5 of the IJG library will need to be modified to check
-for a suspension return from jpeg_start_decompress().
-
-To perform incremental display, an application must use the library's
-buffered-image mode.  This is described in the next section.
-
-
-Buffered-image mode
--------------------
-
-In buffered-image mode, the library stores the partially decoded image in a
-coefficient buffer, from which it can be read out as many times as desired.
-This mode is typically used for incremental display of progressive JPEG files,
-but it can be used with any JPEG file.  Each scan of a progressive JPEG file
-adds more data (more detail) to the buffered image.  The application can
-display in lockstep with the source file (one display pass per input scan),
-or it can allow input processing to outrun display processing.  By making
-input and display processing run independently, it is possible for the
-application to adapt progressive display to a wide range of data transmission
-rates.
-
-The basic control flow for buffered-image decoding is
-
-	jpeg_create_decompress()
-	set data source
-	jpeg_read_header()
-	set overall decompression parameters
-	cinfo.buffered_image = TRUE;	/* select buffered-image mode */
-	jpeg_start_decompress()
-	for (each output pass) {
-	    adjust output decompression parameters if required
-	    jpeg_start_output()		/* start a new output pass */
-	    for (all scanlines in image) {
-	        jpeg_read_scanlines()
-	        display scanlines
-	    }
-	    jpeg_finish_output()	/* terminate output pass */
-	}
-	jpeg_finish_decompress()
-	jpeg_destroy_decompress()
-
-This differs from ordinary unbuffered decoding in that there is an additional
-level of looping.  The application can choose how many output passes to make
-and how to display each pass.
-
-The simplest approach to displaying progressive images is to do one display
-pass for each scan appearing in the input file.  In this case the outer loop
-condition is typically
-	while (! jpeg_input_complete(&cinfo))
-and the start-output call should read
-	jpeg_start_output(&cinfo, cinfo.input_scan_number);
-The second parameter to jpeg_start_output() indicates which scan of the input
-file is to be displayed; the scans are numbered starting at 1 for this
-purpose.  (You can use a loop counter starting at 1 if you like, but using
-the library's input scan counter is easier.)  The library automatically reads
-data as necessary to complete each requested scan, and jpeg_finish_output()
-advances to the next scan or end-of-image marker (hence input_scan_number
-will be incremented by the time control arrives back at jpeg_start_output()).
-With this technique, data is read from the input file only as needed, and
-input and output processing run in lockstep.
-
-After reading the final scan and reaching the end of the input file, the
-buffered image remains available; it can be read additional times by
-repeating the jpeg_start_output()/jpeg_read_scanlines()/jpeg_finish_output()
-sequence.  For example, a useful technique is to use fast one-pass color
-quantization for display passes made while the image is arriving, followed by
-a final display pass using two-pass quantization for highest quality.  This
-is done by changing the library parameters before the final output pass.
-Changing parameters between passes is discussed in detail below.
-
-In general the last scan of a progressive file cannot be recognized as such
-until after it is read, so a post-input display pass is the best approach if
-you want special processing in the final pass.
-
-When done with the image, be sure to call jpeg_finish_decompress() to release
-the buffered image (or just use jpeg_destroy_decompress()).
-
-If input data arrives faster than it can be displayed, the application can
-cause the library to decode input data in advance of what's needed to produce
-output.  This is done by calling the routine jpeg_consume_input().
-The return value is one of the following:
-	JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
-	JPEG_REACHED_EOI:    reached the EOI marker (end of image)
-	JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
-	JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
-	JPEG_SUSPENDED:      suspended before completing any of the above
-(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
-routine can be called at any time after initializing the JPEG object.  It
-reads some additional data and returns when one of the indicated significant
-events occurs.  (If called after the EOI marker is reached, it will
-immediately return JPEG_REACHED_EOI without attempting to read more data.)
-
-The library's output processing will automatically call jpeg_consume_input()
-whenever the output processing overtakes the input; thus, simple lockstep
-display requires no direct calls to jpeg_consume_input().  But by adding
-calls to jpeg_consume_input(), you can absorb data in advance of what is
-being displayed.  This has two benefits:
-  * You can limit buildup of unprocessed data in your input buffer.
-  * You can eliminate extra display passes by paying attention to the
-    state of the library's input processing.
-
-The first of these benefits only requires interspersing calls to
-jpeg_consume_input() with your display operations and any other processing
-you may be doing.  To avoid wasting cycles due to backtracking, it's best to
-call jpeg_consume_input() only after a hundred or so new bytes have arrived.
-This is discussed further under "I/O suspension", above.  (Note: the JPEG
-library currently is not thread-safe.  You must not call jpeg_consume_input()
-from one thread of control if a different library routine is working on the
-same JPEG object in another thread.)
-
-When input arrives fast enough that more than one new scan is available
-before you start a new output pass, you may as well skip the output pass
-corresponding to the completed scan.  This occurs for free if you pass
-cinfo.input_scan_number as the target scan number to jpeg_start_output().
-The input_scan_number field is simply the index of the scan currently being
-consumed by the input processor.  You can ensure that this is up-to-date by
-emptying the input buffer just before calling jpeg_start_output(): call
-jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
-JPEG_REACHED_EOI.
-
-The target scan number passed to jpeg_start_output() is saved in the
-cinfo.output_scan_number field.  The library's output processing calls
-jpeg_consume_input() whenever the current input scan number and row within
-that scan are less than or equal to the current output scan number and row.
-Thus, input processing can "get ahead" of the output processing but is not
-allowed to "fall behind".  You can achieve several different effects by
-manipulating this interlock rule.  For example, if you pass a target scan
-number greater than the current input scan number, the output processor will
-wait until that scan starts to arrive before producing any output.  (To avoid
-an infinite loop, the target scan number is automatically reset to the last
-scan number when the end of image is reached.  Thus, if you specify a large
-target scan number, the library will just absorb the entire input file and
-then perform an output pass.  This is effectively the same as what
-jpeg_start_decompress() does when you don't select buffered-image mode.)
-When you pass a target scan number equal to the current input scan number,
-the image is displayed no faster than the current input scan arrives.  The
-final possibility is to pass a target scan number less than the current input
-scan number; this disables the input/output interlock and causes the output
-processor to simply display whatever it finds in the image buffer, without
-waiting for input.  (However, the library will not accept a target scan
-number less than one, so you can't avoid waiting for the first scan.)
-
-When data is arriving faster than the output display processing can advance
-through the image, jpeg_consume_input() will store data into the buffered
-image beyond the point at which the output processing is reading data out
-again.  If the input arrives fast enough, it may "wrap around" the buffer to
-the point where the input is more than one whole scan ahead of the output.
-If the output processing simply proceeds through its display pass without
-paying attention to the input, the effect seen on-screen is that the lower
-part of the image is one or more scans better in quality than the upper part.
-Then, when the next output scan is started, you have a choice of what target
-scan number to use.  The recommended choice is to use the current input scan
-number at that time, which implies that you've skipped the output scans
-corresponding to the input scans that were completed while you processed the
-previous output scan.  In this way, the decoder automatically adapts its
-speed to the arriving data, by skipping output scans as necessary to keep up
-with the arriving data.
-
-When using this strategy, you'll want to be sure that you perform a final
-output pass after receiving all the data; otherwise your last display may not
-be full quality across the whole screen.  So the right outer loop logic is
-something like this:
-	do {
-	    absorb any waiting input by calling jpeg_consume_input()
-	    final_pass = jpeg_input_complete(&cinfo);
-	    adjust output decompression parameters if required
-	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
-	    ...
-	    jpeg_finish_output()
-	} while (! final_pass);
-rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
-arrangement makes it simple to use higher-quality decoding parameters
-for the final pass.  But if you don't want to use special parameters for
-the final pass, the right loop logic is like this:
-	for (;;) {
-	    absorb any waiting input by calling jpeg_consume_input()
-	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
-	    ...
-	    jpeg_finish_output()
-	    if (jpeg_input_complete(&cinfo) &&
-	        cinfo.input_scan_number == cinfo.output_scan_number)
-	      break;
-	}
-In this case you don't need to know in advance whether an output pass is to
-be the last one, so it's not necessary to have reached EOF before starting
-the final output pass; rather, what you want to test is whether the output
-pass was performed in sync with the final input scan.  This form of the loop
-will avoid an extra output pass whenever the decoder is able (or nearly able)
-to keep up with the incoming data.
-
-When the data transmission speed is high, you might begin a display pass,
-then find that much or all of the file has arrived before you can complete
-the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
-from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
-In this situation you may wish to abort the current display pass and start a
-new one using the newly arrived information.  To do so, just call
-jpeg_finish_output() and then start a new pass with jpeg_start_output().
-
-A variant strategy is to abort and restart display if more than one complete
-scan arrives during an output pass; this can be detected by noting
-JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
-idea should be employed with caution, however, since the display process
-might never get to the bottom of the image before being aborted, resulting
-in the lower part of the screen being several passes worse than the upper.
-In most cases it's probably best to abort an output pass only if the whole
-file has arrived and you want to begin the final output pass immediately.
-
-When receiving data across a communication link, we recommend always using
-the current input scan number for the output target scan number; if a
-higher-quality final pass is to be done, it should be started (aborting any
-incomplete output pass) as soon as the end of file is received.  However,
-many other strategies are possible.  For example, the application can examine
-the parameters of the current input scan and decide whether to display it or
-not.  If the scan contains only chroma data, one might choose not to use it
-as the target scan, expecting that the scan will be small and will arrive
-quickly.  To skip to the next scan, call jpeg_consume_input() until it
-returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
-number as the target scan for jpeg_start_output(); but that method doesn't
-let you inspect the next scan's parameters before deciding to display it.
-
-
-In buffered-image mode, jpeg_start_decompress() never performs input and
-thus never suspends.  An application that uses input suspension with
-buffered-image mode must be prepared for suspension returns from these
-routines:
-* jpeg_start_output() performs input only if you request 2-pass quantization
-  and the target scan isn't fully read yet.  (This is discussed below.)
-* jpeg_read_scanlines(), as always, returns the number of scanlines that it
-  was able to produce before suspending.
-* jpeg_finish_output() will read any markers following the target scan,
-  up to the end of the file or the SOS marker that begins another scan.
-  (But it reads no input if jpeg_consume_input() has already reached the
-  end of the file or a SOS marker beyond the target output scan.)
-* jpeg_finish_decompress() will read until the end of file, and thus can
-  suspend if the end hasn't already been reached (as can be tested by
-  calling jpeg_input_complete()).
-jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
-all return TRUE if they completed their tasks, FALSE if they had to suspend.
-In the event of a FALSE return, the application must load more input data
-and repeat the call.  Applications that use non-suspending data sources need
-not check the return values of these three routines.
-
-
-It is possible to change decoding parameters between output passes in the
-buffered-image mode.  The decoder library currently supports only very
-limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
-allowed after jpeg_start_decompress() is called:
-* dct_method can be changed before each call to jpeg_start_output().
-  For example, one could use a fast DCT method for early scans, changing
-  to a higher quality method for the final scan.
-* dither_mode can be changed before each call to jpeg_start_output();
-  of course this has no impact if not using color quantization.  Typically
-  one would use ordered dither for initial passes, then switch to
-  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
-  can cause more memory to be allocated by the library.  Although the amount
-  of memory involved is not large (a scanline or so), it may cause the
-  initial max_memory_to_use specification to be exceeded, which in the worst
-  case would result in an out-of-memory failure.
-* do_block_smoothing can be changed before each call to jpeg_start_output().
-  This setting is relevant only when decoding a progressive JPEG image.
-  During the first DC-only scan, block smoothing provides a very "fuzzy" look
-  instead of the very "blocky" look seen without it; which is better seems a
-  matter of personal taste.  But block smoothing is nearly always a win
-  during later stages, especially when decoding a successive-approximation
-  image: smoothing helps to hide the slight blockiness that otherwise shows
-  up on smooth gradients until the lowest coefficient bits are sent.
-* Color quantization mode can be changed under the rules described below.
-  You *cannot* change between full-color and quantized output (because that
-  would alter the required I/O buffer sizes), but you can change which
-  quantization method is used.
-
-When generating color-quantized output, changing quantization method is a
-very useful way of switching between high-speed and high-quality display.
-The library allows you to change among its three quantization methods:
-1. Single-pass quantization to a fixed color cube.
-   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
-2. Single-pass quantization to an application-supplied colormap.
-   Selected by setting cinfo.colormap to point to the colormap (the value of
-   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
-3. Two-pass quantization to a colormap chosen specifically for the image.
-   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
-   (This is the default setting selected by jpeg_read_header, but it is
-   probably NOT what you want for the first pass of progressive display!)
-These methods offer successively better quality and lesser speed.  However,
-only the first method is available for quantizing in non-RGB color spaces.
-
-IMPORTANT: because the different quantizer methods have very different
-working-storage requirements, the library requires you to indicate which
-one(s) you intend to use before you call jpeg_start_decompress().  (If we did
-not require this, the max_memory_to_use setting would be a complete fiction.)
-You do this by setting one or more of these three cinfo fields to TRUE:
-	enable_1pass_quant		Fixed color cube colormap
-	enable_external_quant		Externally-supplied colormap
-	enable_2pass_quant		Two-pass custom colormap
-All three are initialized FALSE by jpeg_read_header().  But
-jpeg_start_decompress() automatically sets TRUE the one selected by the
-current two_pass_quantize and colormap settings, so you only need to set the
-enable flags for any other quantization methods you plan to change to later.
-
-After setting the enable flags correctly at jpeg_start_decompress() time, you
-can change to any enabled quantization method by setting two_pass_quantize
-and colormap properly just before calling jpeg_start_output().  The following
-special rules apply:
-1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
-   or 2-pass mode from a different mode, or when you want the 2-pass
-   quantizer to be re-run to generate a new colormap.
-2. To switch to an external colormap, or to change to a different external
-   colormap than was used on the prior pass, you must call
-   jpeg_new_colormap() after setting cinfo.colormap.
-NOTE: if you want to use the same colormap as was used in the prior pass,
-you should not do either of these things.  This will save some nontrivial
-switchover costs.
-(These requirements exist because cinfo.colormap will always be non-NULL
-after completing a prior output pass, since both the 1-pass and 2-pass
-quantizers set it to point to their output colormaps.  Thus you have to
-do one of these two things to notify the library that something has changed.
-Yup, it's a bit kludgy, but it's necessary to do it this way for backwards
-compatibility.)
-
-Note that in buffered-image mode, the library generates any requested colormap
-during jpeg_start_output(), not during jpeg_start_decompress().
-
-When using two-pass quantization, jpeg_start_output() makes a pass over the
-buffered image to determine the optimum color map; it therefore may take a
-significant amount of time, whereas ordinarily it does little work.  The
-progress monitor hook is called during this pass, if defined.  It is also
-important to realize that if the specified target scan number is greater than
-or equal to the current input scan number, jpeg_start_output() will attempt
-to consume input as it makes this pass.  If you use a suspending data source,
-you need to check for a FALSE return from jpeg_start_output() under these
-conditions.  The combination of 2-pass quantization and a not-yet-fully-read
-target scan is the only case in which jpeg_start_output() will consume input.
-
-
-Application authors who support buffered-image mode may be tempted to use it
-for all JPEG images, even single-scan ones.  This will work, but it is
-inefficient: there is no need to create an image-sized coefficient buffer for
-single-scan images.  Requesting buffered-image mode for such an image wastes
-memory.  Worse, it can cost time on large images, since the buffered data has
-to be swapped out or written to a temporary file.  If you are concerned about
-maximum performance on baseline JPEG files, you should use buffered-image
-mode only when the incoming file actually has multiple scans.  This can be
-tested by calling jpeg_has_multiple_scans(), which will return a correct
-result at any time after jpeg_read_header() completes.
-
-It is also worth noting that when you use jpeg_consume_input() to let input
-processing get ahead of output processing, the resulting pattern of access to
-the coefficient buffer is quite nonsequential.  It's best to use the memory
-manager jmemnobs.c if you can (i.e., if you have enough real or virtual main
-memory).  If not, at least make sure that max_memory_to_use is set as high as
-possible.  If the JPEG memory manager has to use a temporary file, you will
-probably see a lot of disk traffic and poor performance.  (This could be
-improved with additional work on the memory manager, but we haven't gotten
-around to it yet.)
-
-In some applications it may be convenient to use jpeg_consume_input() for all
-input processing, including reading the initial markers; that is, you may
-wish to call jpeg_consume_input() instead of jpeg_read_header() during
-startup.  This works, but note that you must check for JPEG_REACHED_SOS and
-JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
-Once the first SOS marker has been reached, you must call
-jpeg_start_decompress() before jpeg_consume_input() will consume more input;
-it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
-tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
-without ever returning JPEG_REACHED_SOS; be sure to check for this case.
-If this happens, the decompressor will not read any more input until you call
-jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
-using buffered-image mode, but in that case it's basically a no-op after the
-initial markers have been read: it will just return JPEG_SUSPENDED.
-
-
-Abbreviated datastreams and multiple images
--------------------------------------------
-
-A JPEG compression or decompression object can be reused to process multiple
-images.  This saves a small amount of time per image by eliminating the
-"create" and "destroy" operations, but that isn't the real purpose of the
-feature.  Rather, reuse of an object provides support for abbreviated JPEG
-datastreams.  Object reuse can also simplify processing a series of images in
-a single input or output file.  This section explains these features.
-
-A JPEG file normally contains several hundred bytes worth of quantization
-and Huffman tables.  In a situation where many images will be stored or
-transmitted with identical tables, this may represent an annoying overhead.
-The JPEG standard therefore permits tables to be omitted.  The standard
-defines three classes of JPEG datastreams:
-  * "Interchange" datastreams contain an image and all tables needed to decode
-     the image.  These are the usual kind of JPEG file.
-  * "Abbreviated image" datastreams contain an image, but are missing some or
-    all of the tables needed to decode that image.
-  * "Abbreviated table specification" (henceforth "tables-only") datastreams
-    contain only table specifications.
-To decode an abbreviated image, it is necessary to load the missing table(s)
-into the decoder beforehand.  This can be accomplished by reading a separate
-tables-only file.  A variant scheme uses a series of images in which the first
-image is an interchange (complete) datastream, while subsequent ones are
-abbreviated and rely on the tables loaded by the first image.  It is assumed
-that once the decoder has read a table, it will remember that table until a
-new definition for the same table number is encountered.
-
-It is the application designer's responsibility to figure out how to associate
-the correct tables with an abbreviated image.  While abbreviated datastreams
-can be useful in a closed environment, their use is strongly discouraged in
-any situation where data exchange with other applications might be needed.
-Caveat designer.
-
-The JPEG library provides support for reading and writing any combination of
-tables-only datastreams and abbreviated images.  In both compression and
-decompression objects, a quantization or Huffman table will be retained for
-the lifetime of the object, unless it is overwritten by a new table definition.
-
-
-To create abbreviated image datastreams, it is only necessary to tell the
-compressor not to emit some or all of the tables it is using.  Each
-quantization and Huffman table struct contains a boolean field "sent_table",
-which normally is initialized to FALSE.  For each table used by the image, the
-header-writing process emits the table and sets sent_table = TRUE unless it is
-already TRUE.  (In normal usage, this prevents outputting the same table
-definition multiple times, as would otherwise occur because the chroma
-components typically share tables.)  Thus, setting this field to TRUE before
-calling jpeg_start_compress() will prevent the table from being written at
-all.
-
-If you want to create a "pure" abbreviated image file containing no tables,
-just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
-tables.  If you want to emit some but not all tables, you'll need to set the
-individual sent_table fields directly.
-
-To create an abbreviated image, you must also call jpeg_start_compress()
-with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
-will force all the sent_table fields to FALSE.  (This is a safety feature to
-prevent abbreviated images from being created accidentally.)
-
-To create a tables-only file, perform the same parameter setup that you
-normally would, but instead of calling jpeg_start_compress() and so on, call
-jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
-containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
-and Huffman tables that are currently defined in the compression object will
-be emitted unless their sent_table flag is already TRUE, and then all the
-sent_table flags will be set TRUE.
-
-A sure-fire way to create matching tables-only and abbreviated image files
-is to proceed as follows:
-
-	create JPEG compression object
-	set JPEG parameters
-	set destination to tables-only file
-	jpeg_write_tables(&cinfo);
-	set destination to image file
-	jpeg_start_compress(&cinfo, FALSE);
-	write data...
-	jpeg_finish_compress(&cinfo);
-
-Since the JPEG parameters are not altered between writing the table file and
-the abbreviated image file, the same tables are sure to be used.  Of course,
-you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
-many times to produce many abbreviated image files matching the table file.
-
-You cannot suppress output of the computed Huffman tables when Huffman
-optimization is selected.  (If you could, there'd be no way to decode the
-image...)  Generally, you don't want to set optimize_coding = TRUE when
-you are trying to produce abbreviated files.
-
-In some cases you might want to compress an image using tables which are
-not stored in the application, but are defined in an interchange or
-tables-only file readable by the application.  This can be done by setting up
-a JPEG decompression object to read the specification file, then copying the
-tables into your compression object.  See jpeg_copy_critical_parameters()
-for an example of copying quantization tables.
-
-
-To read abbreviated image files, you simply need to load the proper tables
-into the decompression object before trying to read the abbreviated image.
-If the proper tables are stored in the application program, you can just
-allocate the table structs and fill in their contents directly.  For example,
-to load a fixed quantization table into table slot "n":
-
-    if (cinfo.quant_tbl_ptrs[n] == NULL)
-      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
-    quant_ptr = cinfo.quant_tbl_ptrs[n];	/* quant_ptr is JQUANT_TBL* */
-    for (i = 0; i < 64; i++) {
-      /* Qtable[] is desired quantization table, in natural array order */
-      quant_ptr->quantval[i] = Qtable[i];
-    }
-
-Code to load a fixed Huffman table is typically (for AC table "n"):
-
-    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
-      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
-    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];	/* huff_ptr is JHUFF_TBL* */
-    for (i = 1; i <= 16; i++) {
-      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
-      huff_ptr->bits[i] = counts[i];
-    }
-    for (i = 0; i < 256; i++) {
-      /* symbols[] is the list of Huffman symbols, in code-length order */
-      huff_ptr->huffval[i] = symbols[i];
-    }
-
-(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
-constant JQUANT_TBL object is not safe.  If the incoming file happened to
-contain a quantization table definition, your master table would get
-overwritten!  Instead allocate a working table copy and copy the master table
-into it, as illustrated above.  Ditto for Huffman tables, of course.)
-
-You might want to read the tables from a tables-only file, rather than
-hard-wiring them into your application.  The jpeg_read_header() call is
-sufficient to read a tables-only file.  You must pass a second parameter of
-FALSE to indicate that you do not require an image to be present.  Thus, the
-typical scenario is
-
-	create JPEG decompression object
-	set source to tables-only file
-	jpeg_read_header(&cinfo, FALSE);
-	set source to abbreviated image file
-	jpeg_read_header(&cinfo, TRUE);
-	set decompression parameters
-	jpeg_start_decompress(&cinfo);
-	read data...
-	jpeg_finish_decompress(&cinfo);
-
-In some cases, you may want to read a file without knowing whether it contains
-an image or just tables.  In that case, pass FALSE and check the return value
-from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
-JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
-JPEG_SUSPENDED, is possible when using a suspending data source manager.)
-Note that jpeg_read_header() will not complain if you read an abbreviated
-image for which you haven't loaded the missing tables; the missing-table check
-occurs later, in jpeg_start_decompress().
-
-
-It is possible to read a series of images from a single source file by
-repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
-without releasing/recreating the JPEG object or the data source module.
-(If you did reinitialize, any partial bufferload left in the data source
-buffer at the end of one image would be discarded, causing you to lose the
-start of the next image.)  When you use this method, stored tables are
-automatically carried forward, so some of the images can be abbreviated images
-that depend on tables from earlier images.
-
-If you intend to write a series of images into a single destination file,
-you might want to make a specialized data destination module that doesn't
-flush the output buffer at term_destination() time.  This would speed things
-up by some trifling amount.  Of course, you'd need to remember to flush the
-buffer after the last image.  You can make the later images be abbreviated
-ones by passing FALSE to jpeg_start_compress().
-
-
-Special markers
----------------
-
-Some applications may need to insert or extract special data in the JPEG
-datastream.  The JPEG standard provides marker types "COM" (comment) and
-"APP0" through "APP15" (application) to hold application-specific data.
-Unfortunately, the use of these markers is not specified by the standard.
-COM markers are fairly widely used to hold user-supplied text.  The JFIF file
-format spec uses APP0 markers with specified initial strings to hold certain
-data.  Adobe applications use APP14 markers beginning with the string "Adobe"
-for miscellaneous data.  Other APPn markers are rarely seen, but might
-contain almost anything.
-
-If you wish to store user-supplied text, we recommend you use COM markers
-and place readable 7-bit ASCII text in them.  Newline conventions are not
-standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
-(Mac style).  A robust COM reader should be able to cope with random binary
-garbage, including nulls, since some applications generate COM markers
-containing non-ASCII junk.  (But yours should not be one of them.)
-
-For program-supplied data, use an APPn marker, and be sure to begin it with an
-identifying string so that you can tell whether the marker is actually yours.
-It's probably best to avoid using APP0 or APP14 for any private markers.
-(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
-not use APP8 markers for any private purposes, either.)
-
-Keep in mind that at most 65533 bytes can be put into one marker, but you
-can have as many markers as you like.
-
-By default, the IJG compression library will write a JFIF APP0 marker if the
-selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
-the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
-we don't recommend it.  The decompression library will recognize JFIF and
-Adobe markers and will set the JPEG colorspace properly when one is found.
-
-
-You can write special markers immediately following the datastream header by
-calling jpeg_write_marker() after jpeg_start_compress() and before the first
-call to jpeg_write_scanlines().  When you do this, the markers appear after
-the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
-all else.  Specify the marker type parameter as "JPEG_COM" for COM or
-"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
-any marker type, but we don't recommend writing any other kinds of marker.)
-For example, to write a user comment string pointed to by comment_text:
-	jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
-
-If it's not convenient to store all the marker data in memory at once,
-you can instead call jpeg_write_m_header() followed by multiple calls to
-jpeg_write_m_byte().  If you do it this way, it's your responsibility to
-call jpeg_write_m_byte() exactly the number of times given in the length
-parameter to jpeg_write_m_header().  (This method lets you empty the
-output buffer partway through a marker, which might be important when
-using a suspending data destination module.  In any case, if you are using
-a suspending destination, you should flush its buffer after inserting
-any special markers.  See "I/O suspension".)
-
-Or, if you prefer to synthesize the marker byte sequence yourself,
-you can just cram it straight into the data destination module.
-
-If you are writing JFIF 1.02 extension markers (thumbnail images), don't
-forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
-correct JFIF version number in the JFIF header marker.  The library's default
-is to write version 1.01, but that's wrong if you insert any 1.02 extension
-markers.  (We could probably get away with just defaulting to 1.02, but there
-used to be broken decoders that would complain about unknown minor version
-numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
-you are actually using 1.02 extensions.)
-
-
-When reading, two methods of handling special markers are available:
-1. You can ask the library to save the contents of COM and/or APPn markers
-into memory, and then examine them at your leisure afterwards.
-2. You can supply your own routine to process COM and/or APPn markers
-on-the-fly as they are read.
-The first method is simpler to use, especially if you are using a suspending
-data source; writing a marker processor that copes with input suspension is
-not easy (consider what happens if the marker is longer than your available
-input buffer).  However, the second method conserves memory since the marker
-data need not be kept around after it's been processed.
-
-For either method, you'd normally set up marker handling after creating a
-decompression object and before calling jpeg_read_header(), because the
-markers of interest will typically be near the head of the file and so will
-be scanned by jpeg_read_header.  Once you've established a marker handling
-method, it will be used for the life of that decompression object
-(potentially many datastreams), unless you change it.  Marker handling is
-determined separately for COM markers and for each APPn marker code.
-
-
-To save the contents of special markers in memory, call
-	jpeg_save_markers(cinfo, marker_code, length_limit)
-where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
-(To arrange to save all the special marker types, you need to call this
-routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
-than length_limit data bytes, only length_limit bytes will be saved; this
-parameter allows you to avoid chewing up memory when you only need to see the
-first few bytes of a potentially large marker.  If you want to save all the
-data, set length_limit to 0xFFFF; that is enough since marker lengths are only
-16 bits.  As a special case, setting length_limit to 0 prevents that marker
-type from being saved at all.  (That is the default behavior, in fact.)
-
-After jpeg_read_header() completes, you can examine the special markers by
-following the cinfo->marker_list pointer chain.  All the special markers in
-the file appear in this list, in order of their occurrence in the file (but
-omitting any markers of types you didn't ask for).  Both the original data
-length and the saved data length are recorded for each list entry; the latter
-will not exceed length_limit for the particular marker type.  Note that these
-lengths exclude the marker length word, whereas the stored representation
-within the JPEG file includes it.  (Hence the maximum data length is really
-only 65533.)
-
-It is possible that additional special markers appear in the file beyond the
-SOS marker at which jpeg_read_header stops; if so, the marker list will be
-extended during reading of the rest of the file.  This is not expected to be
-common, however.  If you are short on memory you may want to reset the length
-limit to zero for all marker types after finishing jpeg_read_header, to
-ensure that the max_memory_to_use setting cannot be exceeded due to addition
-of later markers.
-
-The marker list remains stored until you call jpeg_finish_decompress or
-jpeg_abort, at which point the memory is freed and the list is set to empty.
-(jpeg_destroy also releases the storage, of course.)
-
-Note that the library is internally interested in APP0 and APP14 markers;
-if you try to set a small nonzero length limit on these types, the library
-will silently force the length up to the minimum it wants.  (But you can set
-a zero length limit to prevent them from being saved at all.)  Also, in a
-16-bit environment, the maximum length limit may be constrained to less than
-65533 by malloc() limitations.  It is therefore best not to assume that the
-effective length limit is exactly what you set it to be.
-
-
-If you want to supply your own marker-reading routine, you do it by calling
-jpeg_set_marker_processor().  A marker processor routine must have the
-signature
-	boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
-Although the marker code is not explicitly passed, the routine can find it
-in cinfo->unread_marker.  At the time of call, the marker proper has been
-read from the data source module.  The processor routine is responsible for
-reading the marker length word and the remaining parameter bytes, if any.
-Return TRUE to indicate success.  (FALSE should be returned only if you are
-using a suspending data source and it tells you to suspend.  See the standard
-marker processors in jdmarker.c for appropriate coding methods if you need to
-use a suspending data source.)
-
-If you override the default APP0 or APP14 processors, it is up to you to
-recognize JFIF and Adobe markers if you want colorspace recognition to occur
-properly.  We recommend copying and extending the default processors if you
-want to do that.  (A better idea is to save these marker types for later
-examination by calling jpeg_save_markers(); that method doesn't interfere
-with the library's own processing of these markers.)
-
-jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
---- if you call one it overrides any previous call to the other, for the
-particular marker type specified.
-
-A simple example of an external COM processor can be found in djpeg.c.
-Also, see jpegtran.c for an example of using jpeg_save_markers.
-
-
-Raw (downsampled) image data
-----------------------------
-
-Some applications need to supply already-downsampled image data to the JPEG
-compressor, or to receive raw downsampled data from the decompressor.  The
-library supports this requirement by allowing the application to write or
-read raw data, bypassing the normal preprocessing or postprocessing steps.
-The interface is different from the standard one and is somewhat harder to
-use.  If your interest is merely in bypassing color conversion, we recommend
-that you use the standard interface and simply set jpeg_color_space =
-in_color_space (or jpeg_color_space = out_color_space for decompression).
-The mechanism described in this section is necessary only to supply or
-receive downsampled image data, in which not all components have the same
-dimensions.
-
-
-To compress raw data, you must supply the data in the colorspace to be used
-in the JPEG file (please read the earlier section on Special color spaces)
-and downsampled to the sampling factors specified in the JPEG parameters.
-You must supply the data in the format used internally by the JPEG library,
-namely a JSAMPIMAGE array.  This is an array of pointers to two-dimensional
-arrays, each of type JSAMPARRAY.  Each 2-D array holds the values for one
-color component.  This structure is necessary since the components are of
-different sizes.  If the image dimensions are not a multiple of the MCU size,
-you must also pad the data correctly (usually, this is done by replicating
-the last column and/or row).  The data must be padded to a multiple of a DCT
-block in each component: that is, each downsampled row must contain a
-multiple of 8 valid samples, and there must be a multiple of 8 sample rows
-for each component.  (For applications such as conversion of digital TV
-images, the standard image size is usually a multiple of the DCT block size,
-so that no padding need actually be done.)
-
-The procedure for compression of raw data is basically the same as normal
-compression, except that you call jpeg_write_raw_data() in place of
-jpeg_write_scanlines().  Before calling jpeg_start_compress(), you must do
-the following:
-  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
-    This notifies the library that you will be supplying raw data.
-  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
-    call is a good idea.  Note that since color conversion is bypassed,
-    in_color_space is ignored, except that jpeg_set_defaults() uses it to
-    choose the default jpeg_color_space setting.
-  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
-    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
-    dimensions of the data you are supplying, it's wise to set them
-    explicitly, rather than assuming the library's defaults are what you want.
-
-To pass raw data to the library, call jpeg_write_raw_data() in place of
-jpeg_write_scanlines().  The two routines work similarly except that
-jpeg_write_raw_data takes a JSAMPIMAGE data array rather than JSAMPARRAY.
-The scanlines count passed to and returned from jpeg_write_raw_data is
-measured in terms of the component with the largest v_samp_factor.
-
-jpeg_write_raw_data() processes one MCU row per call, which is to say
-v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
-value must be at least max_v_samp_factor*DCTSIZE, and the return value will
-be exactly that amount (or possibly some multiple of that amount, in future
-library versions).  This is true even on the last call at the bottom of the
-image; don't forget to pad your data as necessary.
-
-The required dimensions of the supplied data can be computed for each
-component as
-	cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
-	cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
-after jpeg_start_compress() has initialized those fields.  If the valid data
-is smaller than this, it must be padded appropriately.  For some sampling
-factors and image sizes, additional dummy DCT blocks are inserted to make
-the image a multiple of the MCU dimensions.  The library creates such dummy
-blocks itself; it does not read them from your supplied data.  Therefore you
-need never pad by more than DCTSIZE samples.  An example may help here.
-Assume 2h2v downsampling of YCbCr data, that is
-	cinfo->comp_info[0].h_samp_factor = 2		for Y
-	cinfo->comp_info[0].v_samp_factor = 2
-	cinfo->comp_info[1].h_samp_factor = 1		for Cb
-	cinfo->comp_info[1].v_samp_factor = 1
-	cinfo->comp_info[2].h_samp_factor = 1		for Cr
-	cinfo->comp_info[2].v_samp_factor = 1
-and suppose that the nominal image dimensions (cinfo->image_width and
-cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
-compute downsampled_width = 101 and width_in_blocks = 13 for Y,
-downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
-for the height fields).  You must pad the Y data to at least 13*8 = 104
-columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
-MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
-scanlines on each call to jpeg_write_raw_data(), which is to say 16 actual
-sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
-so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
-of Y data is dummy, so it doesn't matter what you pass for it in the data
-arrays, but the scanlines count must total up to 112 so that all of the Cb
-and Cr data gets passed.
-
-Output suspension is supported with raw-data compression: if the data
-destination module suspends, jpeg_write_raw_data() will return 0.
-In this case the same data rows must be passed again on the next call.
-
-
-Decompression with raw data output implies bypassing all postprocessing:
-you cannot ask for rescaling or color quantization, for instance.  More
-seriously, you must deal with the color space and sampling factors present in
-the incoming file.  If your application only handles, say, 2h1v YCbCr data,
-you must check for and fail on other color spaces or other sampling factors.
-The library will not convert to a different color space for you.
-
-To obtain raw data output, set cinfo->raw_data_out = TRUE before
-jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
-verify that the color space and sampling factors are ones you can handle.
-Then call jpeg_read_raw_data() in place of jpeg_read_scanlines().  The
-decompression process is otherwise the same as usual.
-
-jpeg_read_raw_data() returns one MCU row per call, and thus you must pass a
-buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
-the same as for raw-data compression).  The buffer you pass must be large
-enough to hold the actual data plus padding to DCT-block boundaries.  As with
-compression, any entirely dummy DCT blocks are not processed so you need not
-allocate space for them, but the total scanline count includes them.  The
-above example of computing buffer dimensions for raw-data compression is
-equally valid for decompression.
-
-Input suspension is supported with raw-data decompression: if the data source
-module suspends, jpeg_read_raw_data() will return 0.  You can also use
-buffered-image mode to read raw data in multiple passes.
-
-
-Really raw data: DCT coefficients
----------------------------------
-
-It is possible to read or write the contents of a JPEG file as raw DCT
-coefficients.  This facility is mainly intended for use in lossless
-transcoding between different JPEG file formats.  Other possible applications
-include lossless cropping of a JPEG image, lossless reassembly of a
-multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
-
-To read the contents of a JPEG file as DCT coefficients, open the file and do
-jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
-and jpeg_read_scanlines(), call jpeg_read_coefficients().  This will read the
-entire image into a set of virtual coefficient-block arrays, one array per
-component.  The return value is a pointer to an array of virtual-array
-descriptors.  Each virtual array can be accessed directly using the JPEG
-memory manager's access_virt_barray method (see Memory management, below,
-and also read structure.doc's discussion of virtual array handling).  Or,
-for simple transcoding to a different JPEG file format, the array list can
-just be handed directly to jpeg_write_coefficients().
-
-Each block in the block arrays contains quantized coefficient values in
-normal array order (not JPEG zigzag order).  The block arrays contain only
-DCT blocks containing real data; any entirely-dummy blocks added to fill out
-interleaved MCUs at the right or bottom edges of the image are discarded
-during reading and are not stored in the block arrays.  (The size of each
-block array can be determined from the width_in_blocks and height_in_blocks
-fields of the component's comp_info entry.)  This is also the data format
-expected by jpeg_write_coefficients().
-
-When you are done using the virtual arrays, call jpeg_finish_decompress()
-to release the array storage and return the decompression object to an idle
-state; or just call jpeg_destroy() if you don't need to reuse the object.
-
-If you use a suspending data source, jpeg_read_coefficients() will return
-NULL if it is forced to suspend; a non-NULL return value indicates successful
-completion.  You need not test for a NULL return value when using a
-non-suspending data source.
-
-It is also possible to call jpeg_read_coefficients() to obtain access to the
-decoder's coefficient arrays during a normal decode cycle in buffered-image
-mode.  This frammish might be useful for progressively displaying an incoming
-image and then re-encoding it without loss.  To do this, decode in buffered-
-image mode as discussed previously, then call jpeg_read_coefficients() after
-the last jpeg_finish_output() call.  The arrays will be available for your use
-until you call jpeg_finish_decompress().
-
-
-To write the contents of a JPEG file as DCT coefficients, you must provide
-the DCT coefficients stored in virtual block arrays.  You can either pass
-block arrays read from an input JPEG file by jpeg_read_coefficients(), or
-allocate virtual arrays from the JPEG compression object and fill them
-yourself.  In either case, jpeg_write_coefficients() is substituted for
-jpeg_start_compress() and jpeg_write_scanlines().  Thus the sequence is
-  * Create compression object
-  * Set all compression parameters as necessary
-  * Request virtual arrays if needed
-  * jpeg_write_coefficients()
-  * jpeg_finish_compress()
-  * Destroy or re-use compression object
-jpeg_write_coefficients() is passed a pointer to an array of virtual block
-array descriptors; the number of arrays is equal to cinfo.num_components.
-
-The virtual arrays need only have been requested, not realized, before
-jpeg_write_coefficients() is called.  A side-effect of
-jpeg_write_coefficients() is to realize any virtual arrays that have been
-requested from the compression object's memory manager.  Thus, when obtaining
-the virtual arrays from the compression object, you should fill the arrays
-after calling jpeg_write_coefficients().  The data is actually written out
-when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
-the file header.
-
-When writing raw DCT coefficients, it is crucial that the JPEG quantization
-tables and sampling factors match the way the data was encoded, or the
-resulting file will be invalid.  For transcoding from an existing JPEG file,
-we recommend using jpeg_copy_critical_parameters().  This routine initializes
-all the compression parameters to default values (like jpeg_set_defaults()),
-then copies the critical information from a source decompression object.
-The decompression object should have just been used to read the entire
-JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
-
-jpeg_write_coefficients() marks all tables stored in the compression object
-as needing to be written to the output file (thus, it acts like
-jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
-emitting abbreviated JPEG files by accident.  If you really want to emit an
-abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
-individual sent_table flags, between calling jpeg_write_coefficients() and
-jpeg_finish_compress().
-
-
-Progress monitoring
--------------------
-
-Some applications may need to regain control from the JPEG library every so
-often.  The typical use of this feature is to produce a percent-done bar or
-other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
-Although you do get control back frequently during the data-transferring pass
-(the jpeg_read_scanlines or jpeg_write_scanlines loop), any additional passes
-will occur inside jpeg_finish_compress or jpeg_start_decompress; those
-routines may take a long time to execute, and you don't get control back
-until they are done.
-
-You can define a progress-monitor routine which will be called periodically
-by the library.  No guarantees are made about how often this call will occur,
-so we don't recommend you use it for mouse tracking or anything like that.
-At present, a call will occur once per MCU row, scanline, or sample row
-group, whichever unit is convenient for the current processing mode; so the
-wider the image, the longer the time between calls.  During the data
-transferring pass, only one call occurs per call of jpeg_read_scanlines or
-jpeg_write_scanlines, so don't pass a large number of scanlines at once if
-you want fine resolution in the progress count.  (If you really need to use
-the callback mechanism for time-critical tasks like mouse tracking, you could
-insert additional calls inside some of the library's inner loops.)
-
-To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
-fill in its progress_monitor field with a pointer to your callback routine,
-and set cinfo->progress to point to the struct.  The callback will be called
-whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
-jpeg_create_compress or jpeg_create_decompress; the library will not change
-it thereafter.  So if you allocate dynamic storage for the progress struct,
-make sure it will live as long as the JPEG object does.  Allocating from the
-JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
-can use the same callback routine for both compression and decompression.
-
-The jpeg_progress_mgr struct contains four fields which are set by the library:
-	long pass_counter;	/* work units completed in this pass */
-	long pass_limit;	/* total number of work units in this pass */
-	int completed_passes;	/* passes completed so far */
-	int total_passes;	/* total number of passes expected */
-During any one pass, pass_counter increases from 0 up to (not including)
-pass_limit; the step size is usually but not necessarily 1.  The pass_limit
-value may change from one pass to another.  The expected total number of
-passes is in total_passes, and the number of passes already completed is in
-completed_passes.  Thus the fraction of work completed may be estimated as
-		completed_passes + (pass_counter/pass_limit)
-		--------------------------------------------
-				total_passes
-ignoring the fact that the passes may not be equal amounts of work.
-
-When decompressing, pass_limit can even change within a pass, because it
-depends on the number of scans in the JPEG file, which isn't always known in
-advance.  The computed fraction-of-work-done may jump suddenly (if the library
-discovers it has overestimated the number of scans) or even decrease (in the
-opposite case).  It is not wise to put great faith in the work estimate.
-
-When using the decompressor's buffered-image mode, the progress monitor work
-estimate is likely to be completely unhelpful, because the library has no way
-to know how many output passes will be demanded of it.  Currently, the library
-sets total_passes based on the assumption that there will be one more output
-pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
-TRUE), but no more output passes if the file end has been reached when the
-output pass is started.  This means that total_passes will rise as additional
-output passes are requested.  If you have a way of determining the input file
-size, estimating progress based on the fraction of the file that's been read
-will probably be more useful than using the library's value.
-
-
-Memory management
------------------
-
-This section covers some key facts about the JPEG library's built-in memory
-manager.  For more info, please read structure.doc's section about the memory
-manager, and consult the source code if necessary.
-
-All memory and temporary file allocation within the library is done via the
-memory manager.  If necessary, you can replace the "back end" of the memory
-manager to control allocation yourself (for example, if you don't want the
-library to use malloc() and free() for some reason).
-
-Some data is allocated "permanently" and will not be freed until the JPEG
-object is destroyed.  Most data is allocated "per image" and is freed by
-jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
-memory manager yourself to allocate structures that will automatically be
-freed at these times.  Typical code for this is
-  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, size);
-Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
-Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
-There are also alloc_sarray and alloc_barray routines that automatically
-build 2-D sample or block arrays.
-
-The library's minimum space requirements to process an image depend on the
-image's width, but not on its height, because the library ordinarily works
-with "strip" buffers that are as wide as the image but just a few rows high.
-Some operating modes (eg, two-pass color quantization) require full-image
-buffers.  Such buffers are treated as "virtual arrays": only the current strip
-need be in memory, and the rest can be swapped out to a temporary file.
-
-If you use the simplest memory manager back end (jmemnobs.c), then no
-temporary files are used; virtual arrays are simply malloc()'d.  Images bigger
-than memory can be processed only if your system supports virtual memory.
-The other memory manager back ends support temporary files of various flavors
-and thus work in machines without virtual memory.  They may also be useful on
-Unix machines if you need to process images that exceed available swap space.
-
-When using temporary files, the library will make the in-memory buffers for
-its virtual arrays just big enough to stay within a "maximum memory" setting.
-Your application can set this limit by setting cinfo->mem->max_memory_to_use
-after creating the JPEG object.  (Of course, there is still a minimum size for
-the buffers, so the max-memory setting is effective only if it is bigger than
-the minimum space needed.)  If you allocate any large structures yourself, you
-must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
-order to have them counted against the max memory limit.  Also keep in mind
-that space allocated with alloc_small() is ignored, on the assumption that
-it's too small to be worth worrying about; so a reasonable safety margin
-should be left when setting max_memory_to_use.
-
-If you use the jmemname.c or jmemdos.c memory manager back end, it is
-important to clean up the JPEG object properly to ensure that the temporary
-files get deleted.  (This is especially crucial with jmemdos.c, where the
-"temporary files" may be extended-memory segments; if they are not freed,
-DOS will require a reboot to recover the memory.)  Thus, with these memory
-managers, it's a good idea to provide a signal handler that will trap any
-early exit from your program.  The handler should call either jpeg_abort()
-or jpeg_destroy() for any active JPEG objects.  A handler is not needed with
-jmemnobs.c, and shouldn't be necessary with jmemansi.c or jmemmac.c either,
-since the C library is supposed to take care of deleting files made with
-tmpfile().
-
-
-Memory usage
-------------
-
-Working memory requirements while performing compression or decompression
-depend on image dimensions, image characteristics (such as colorspace and
-JPEG process), and operating mode (application-selected options).
-
-As of v6b, the decompressor requires:
- 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
-    on operating mode and image characteristics (particularly color vs.
-    grayscale), but it doesn't depend on image dimensions.
- 2. Strip buffers (of size proportional to the image width) for IDCT and
-    upsampling results.  The worst case for commonly used sampling factors
-    is about 34 bytes * width in pixels for a color image.  A grayscale image
-    only needs about 8 bytes per pixel column.
- 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
-    file (including progressive JPEGs), or whenever you select buffered-image
-    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
-    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
-    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
- 4. To perform 2-pass color quantization, the decompressor also needs a
-    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
-This does not count any memory allocated by the application, such as a
-buffer to hold the final output image.
-
-The above figures are valid for 8-bit JPEG data precision and a machine with
-32-bit ints.  For 12-bit JPEG data, double the size of the strip buffers and
-quantization pixel buffer.  The "fixed-size" data will be somewhat smaller
-with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
-color spaces will require different amounts of space.
-
-The full-image coefficient and pixel buffers, if needed at all, do not
-have to be fully RAM resident; you can have the library use temporary
-files instead when the total memory usage would exceed a limit you set.
-(But if your OS supports virtual memory, it's probably better to just use
-jmemnobs and let the OS do the swapping.)
-
-The compressor's memory requirements are similar, except that it has no need
-for color quantization.  Also, it needs a full-image DCT coefficient buffer
-if Huffman-table optimization is asked for, even if progressive mode is not
-requested.
-
-If you need more detailed information about memory usage in a particular
-situation, you can enable the MEM_STATS code in jmemmgr.c.
-
-
-Library compile-time options
-----------------------------
-
-A number of compile-time options are available by modifying jmorecfg.h.
-
-The JPEG standard provides for both the baseline 8-bit DCT process and
-a 12-bit DCT process.  The IJG code supports 12-bit lossy JPEG if you define
-BITS_IN_JSAMPLE as 12 rather than 8.  Note that this causes JSAMPLE to be
-larger than a char, so it affects the surrounding application's image data.
-The sample applications cjpeg and djpeg can support 12-bit mode only for PPM
-and GIF file formats; you must disable the other file formats to compile a
-12-bit cjpeg or djpeg.  (install.doc has more information about that.)
-At present, a 12-bit library can handle *only* 12-bit images, not both
-precisions.  (If you need to include both 8- and 12-bit libraries in a single
-application, you could probably do it by defining NEED_SHORT_EXTERNAL_NAMES
-for just one of the copies.  You'd have to access the 8-bit and 12-bit copies
-from separate application source files.  This is untested ... if you try it,
-we'd like to hear whether it works!)
-
-Note that a 12-bit library always compresses in Huffman optimization mode,
-in order to generate valid Huffman tables.  This is necessary because our
-default Huffman tables only cover 8-bit data.  If you need to output 12-bit
-files in one pass, you'll have to supply suitable default Huffman tables.
-You may also want to supply your own DCT quantization tables; the existing
-quality-scaling code has been developed for 8-bit use, and probably doesn't
-generate especially good tables for 12-bit.
-
-The maximum number of components (color channels) in the image is determined
-by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
-expect that few applications will need more than four or so.
-
-On machines with unusual data type sizes, you may be able to improve
-performance or reduce memory space by tweaking the various typedefs in
-jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
-is quite slow; consider trading memory for speed by making JCOEF, INT16, and
-UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
-You probably don't want to make JSAMPLE be int unless you have lots of memory
-to burn.
-
-You can reduce the size of the library by compiling out various optional
-functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
-
-You can also save a few K by not having text error messages in the library;
-the standard error message table occupies about 5Kb.  This is particularly
-reasonable for embedded applications where there's no good way to display 
-a message anyway.  To do this, remove the creation of the message table
-(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
-something reasonable without it.  You could output the numeric value of the
-message code number, for example.  If you do this, you can also save a couple
-more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
-you don't need trace capability anyway, right?
-
-
-Portability considerations
---------------------------
-
-The JPEG library has been written to be extremely portable; the sample
-applications cjpeg and djpeg are slightly less so.  This section summarizes
-the design goals in this area.  (If you encounter any bugs that cause the
-library to be less portable than is claimed here, we'd appreciate hearing
-about them.)
-
-The code works fine on ANSI C, C++, and pre-ANSI C compilers, using any of
-the popular system include file setups, and some not-so-popular ones too.
-See install.doc for configuration procedures.
-
-The code is not dependent on the exact sizes of the C data types.  As
-distributed, we make the assumptions that
-	char	is at least 8 bits wide
-	short	is at least 16 bits wide
-	int	is at least 16 bits wide
-	long	is at least 32 bits wide
-(These are the minimum requirements of the ANSI C standard.)  Wider types will
-work fine, although memory may be used inefficiently if char is much larger
-than 8 bits or short is much bigger than 16 bits.  The code should work
-equally well with 16- or 32-bit ints.
-
-In a system where these assumptions are not met, you may be able to make the
-code work by modifying the typedefs in jmorecfg.h.  However, you will probably
-have difficulty if int is less than 16 bits wide, since references to plain
-int abound in the code.
-
-char can be either signed or unsigned, although the code runs faster if an
-unsigned char type is available.  If char is wider than 8 bits, you will need
-to redefine JOCTET and/or provide custom data source/destination managers so
-that JOCTET represents exactly 8 bits of data on external storage.
-
-The JPEG library proper does not assume ASCII representation of characters.
-But some of the image file I/O modules in cjpeg/djpeg do have ASCII
-dependencies in file-header manipulation; so does cjpeg's select_file_type()
-routine.
-
-The JPEG library does not rely heavily on the C library.  In particular, C
-stdio is used only by the data source/destination modules and the error
-handler, all of which are application-replaceable.  (cjpeg/djpeg are more
-heavily dependent on stdio.)  malloc and free are called only from the memory
-manager "back end" module, so you can use a different memory allocator by
-replacing that one file.
-
-The code generally assumes that C names must be unique in the first 15
-characters.  However, global function names can be made unique in the
-first 6 characters by defining NEED_SHORT_EXTERNAL_NAMES.
-
-More info about porting the code may be gleaned by reading jconfig.doc,
-jmorecfg.h, and jinclude.h.
-
-
-Notes for MS-DOS implementors
------------------------------
-
-The IJG code is designed to work efficiently in 80x86 "small" or "medium"
-memory models (i.e., data pointers are 16 bits unless explicitly declared
-"far"; code pointers can be either size).  You may be able to use small
-model to compile cjpeg or djpeg by itself, but you will probably have to use
-medium model for any larger application.  This won't make much difference in
-performance.  You *will* take a noticeable performance hit if you use a
-large-data memory model (perhaps 10%-25%), and you should avoid "huge" model
-if at all possible.
-
-The JPEG library typically needs 2Kb-3Kb of stack space.  It will also
-malloc about 20K-30K of near heap space while executing (and lots of far
-heap, but that doesn't count in this calculation).  This figure will vary
-depending on selected operating mode, and to a lesser extent on image size.
-There is also about 5Kb-6Kb of constant data which will be allocated in the
-near data segment (about 4Kb of this is the error message table).
-Thus you have perhaps 20K available for other modules' static data and near
-heap space before you need to go to a larger memory model.  The C library's
-static data will account for several K of this, but that still leaves a good
-deal for your needs.  (If you are tight on space, you could reduce the sizes
-of the I/O buffers allocated by jdatasrc.c and jdatadst.c, say from 4K to
-1K.  Another possibility is to move the error message table to far memory;
-this should be doable with only localized hacking on jerror.c.)
-
-About 2K of the near heap space is "permanent" memory that will not be
-released until you destroy the JPEG object.  This is only an issue if you
-save a JPEG object between compression or decompression operations.
-
-Far data space may also be a tight resource when you are dealing with large
-images.  The most memory-intensive case is decompression with two-pass color
-quantization, or single-pass quantization to an externally supplied color
-map.  This requires a 128Kb color lookup table plus strip buffers amounting
-to about 40 bytes per column for typical sampling ratios (eg, about 25600
-bytes for a 640-pixel-wide image).  You may not be able to process wide
-images if you have large data structures of your own.
-
-Of course, all of these concerns vanish if you use a 32-bit flat-memory-model
-compiler, such as DJGPP or Watcom C.  We highly recommend flat model if you
-can use it; the JPEG library is significantly faster in flat model.
diff --git a/jpeg/libjpeg.txt b/jpeg/libjpeg.txt
new file mode 100644
index 000000000..e5a85c0e3
--- /dev/null
+++ b/jpeg/libjpeg.txt
@@ -0,0 +1,3070 @@
+USING THE IJG JPEG LIBRARY
+
+Copyright (C) 1994-2009, Thomas G. Lane, Guido Vollbeding.
+This file is part of the Independent JPEG Group's software.
+For conditions of distribution and use, see the accompanying README file.
+
+
+This file describes how to use the IJG JPEG library within an application
+program.  Read it if you want to write a program that uses the library.
+
+The file example.c provides heavily commented skeleton code for calling the
+JPEG library.  Also see jpeglib.h (the include file to be used by application
+programs) for full details about data structures and function parameter lists.
+The library source code, of course, is the ultimate reference.
+
+Note that there have been *major* changes from the application interface
+presented by IJG version 4 and earlier versions.  The old design had several
+inherent limitations, and it had accumulated a lot of cruft as we added
+features while trying to minimize application-interface changes.  We have
+sacrificed backward compatibility in the version 5 rewrite, but we think the
+improvements justify this.
+
+
+TABLE OF CONTENTS
+-----------------
+
+Overview:
+	Functions provided by the library
+	Outline of typical usage
+Basic library usage:
+	Data formats
+	Compression details
+	Decompression details
+	Mechanics of usage: include files, linking, etc
+Advanced features:
+	Compression parameter selection
+	Decompression parameter selection
+	Special color spaces
+	Error handling
+	Compressed data handling (source and destination managers)
+	I/O suspension
+	Progressive JPEG support
+	Buffered-image mode
+	Abbreviated datastreams and multiple images
+	Special markers
+	Raw (downsampled) image data
+	Really raw data: DCT coefficients
+	Progress monitoring
+	Memory management
+	Memory usage
+	Library compile-time options
+	Portability considerations
+	Notes for MS-DOS implementors
+
+You should read at least the overview and basic usage sections before trying
+to program with the library.  The sections on advanced features can be read
+if and when you need them.
+
+
+OVERVIEW
+========
+
+Functions provided by the library
+---------------------------------
+
+The IJG JPEG library provides C code to read and write JPEG-compressed image
+files.  The surrounding application program receives or supplies image data a
+scanline at a time, using a straightforward uncompressed image format.  All
+details of color conversion and other preprocessing/postprocessing can be
+handled by the library.
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  The application indirectly selects use of this code
+by specifying the format in which it wishes to supply or receive image data.
+For example, if colormapped output is requested, then the decompression
+library automatically invokes color quantization.
+
+A wide range of quality vs. speed tradeoffs is possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+A word about functions *not* provided by the library.  We handle a subset of
+the ISO JPEG standard; most baseline, extended-sequential, and progressive
+JPEG processes are supported.  (Our subset includes all features now in common
+use.)  Unsupported ISO options include:
+	* Hierarchical storage
+	* Lossless JPEG
+	* DNL marker
+	* Nonintegral subsampling ratios
+We support both 8- and 12-bit data precision, but this is a compile-time
+choice rather than a run-time choice; hence it is difficult to use both
+precisions in a single application.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, this library is
+used by the free LIBTIFF library to support JPEG compression in TIFF.)
+
+
+Outline of typical usage
+------------------------
+
+The rough outline of a JPEG compression operation is:
+
+	Allocate and initialize a JPEG compression object
+	Specify the destination for the compressed data (eg, a file)
+	Set parameters for compression, including image size & colorspace
+	jpeg_start_compress(...);
+	while (scan lines remain to be written)
+		jpeg_write_scanlines(...);
+	jpeg_finish_compress(...);
+	Release the JPEG compression object
+
+A JPEG compression object holds parameters and working state for the JPEG
+library.  We make creation/destruction of the object separate from starting
+or finishing compression of an image; the same object can be re-used for a
+series of image compression operations.  This makes it easy to re-use the
+same parameter settings for a sequence of images.  Re-use of a JPEG object
+also has important implications for processing abbreviated JPEG datastreams,
+as discussed later.
+
+The image data to be compressed is supplied to jpeg_write_scanlines() from
+in-memory buffers.  If the application is doing file-to-file compression,
+reading image data from the source file is the application's responsibility.
+The library emits compressed data by calling a "data destination manager",
+which typically will write the data into a file; but the application can
+provide its own destination manager to do something else.
+
+Similarly, the rough outline of a JPEG decompression operation is:
+
+	Allocate and initialize a JPEG decompression object
+	Specify the source of the compressed data (eg, a file)
+	Call jpeg_read_header() to obtain image info
+	Set parameters for decompression
+	jpeg_start_decompress(...);
+	while (scan lines remain to be read)
+		jpeg_read_scanlines(...);
+	jpeg_finish_decompress(...);
+	Release the JPEG decompression object
+
+This is comparable to the compression outline except that reading the
+datastream header is a separate step.  This is helpful because information
+about the image's size, colorspace, etc is available when the application
+selects decompression parameters.  For example, the application can choose an
+output scaling ratio that will fit the image into the available screen size.
+
+The decompression library obtains compressed data by calling a data source
+manager, which typically will read the data from a file; but other behaviors
+can be obtained with a custom source manager.  Decompressed data is delivered
+into in-memory buffers passed to jpeg_read_scanlines().
+
+It is possible to abort an incomplete compression or decompression operation
+by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
+simply release it by calling jpeg_destroy().
+
+JPEG compression and decompression objects are two separate struct types.
+However, they share some common fields, and certain routines such as
+jpeg_destroy() can work on either type of object.
+
+The JPEG library has no static variables: all state is in the compression
+or decompression object.  Therefore it is possible to process multiple
+compression and decompression operations concurrently, using multiple JPEG
+objects.
+
+Both compression and decompression can be done in an incremental memory-to-
+memory fashion, if suitable source/destination managers are used.  See the
+section on "I/O suspension" for more details.
+
+
+BASIC LIBRARY USAGE
+===================
+
+Data formats
+------------
+
+Before diving into procedural details, it is helpful to understand the
+image data format that the JPEG library expects or returns.
+
+The standard input image format is a rectangular array of pixels, with each
+pixel having the same number of "component" or "sample" values (color
+channels).  You must specify how many components there are and the colorspace
+interpretation of the components.  Most applications will use RGB data
+(three components per pixel) or grayscale data (one component per pixel).
+PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
+A remarkable number of people manage to miss this, only to find that their
+programs don't work with grayscale JPEG files.
+
+There is no provision for colormapped input.  JPEG files are always full-color
+or full grayscale (or sometimes another colorspace such as CMYK).  You can
+feed in a colormapped image by expanding it to full-color format.  However,
+JPEG often doesn't work very well with source data that has been colormapped,
+because of dithering noise.  This is discussed in more detail in the JPEG FAQ
+and the other references mentioned in the README file.
+
+Pixels are stored by scanlines, with each scanline running from left to
+right.  The component values for each pixel are adjacent in the row; for
+example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
+array of data type JSAMPLE --- which is typically "unsigned char", unless
+you've changed jmorecfg.h.  (You can also change the RGB pixel layout, say
+to B,G,R order, by modifying jmorecfg.h.  But see the restrictions listed in
+that file before doing so.)
+
+A 2-D array of pixels is formed by making a list of pointers to the starts of
+scanlines; so the scanlines need not be physically adjacent in memory.  Even
+if you process just one scanline at a time, you must make a one-element
+pointer array to conform to this structure.  Pointers to JSAMPLE rows are of
+type JSAMPROW, and the pointer to the pointer array is of type JSAMPARRAY.
+
+The library accepts or supplies one or more complete scanlines per call.
+It is not possible to process part of a row at a time.  Scanlines are always
+processed top-to-bottom.  You can process an entire image in one call if you
+have it all in memory, but usually it's simplest to process one scanline at
+a time.
+
+For best results, source data values should have the precision specified by
+BITS_IN_JSAMPLE (normally 8 bits).  For instance, if you choose to compress
+data that's only 6 bits/channel, you should left-justify each value in a
+byte before passing it to the compressor.  If you need to compress data
+that has more than 8 bits/channel, compile with BITS_IN_JSAMPLE = 12.
+(See "Library compile-time options", later.)
+
+
+The data format returned by the decompressor is the same in all details,
+except that colormapped output is supported.  (Again, a JPEG file is never
+colormapped.  But you can ask the decompressor to perform on-the-fly color
+quantization to deliver colormapped output.)  If you request colormapped
+output then the returned data array contains a single JSAMPLE per pixel;
+its value is an index into a color map.  The color map is represented as
+a 2-D JSAMPARRAY in which each row holds the values of one color component;
+that is, colormap[i][j] is the value of the i'th color component for pixel
+value (map index) j.  Note that since the colormap indexes are stored in
+JSAMPLEs, the maximum number of colors is limited by the size of JSAMPLE
+(ie, at most 256 colors for an 8-bit JPEG library).
+
+
+Compression details
+-------------------
+
+Here we revisit the JPEG compression outline given in the overview.
+
+1. Allocate and initialize a JPEG compression object.
+
+A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
+a bunch of subsidiary structures which are allocated via malloc(), but the
+application doesn't control those directly.)  This struct can be just a local
+variable in the calling routine, if a single routine is going to execute the
+whole JPEG compression sequence.  Otherwise it can be static or allocated
+from malloc().
+
+You will also need a structure representing a JPEG error handler.  The part
+of this that the library cares about is a "struct jpeg_error_mgr".  If you
+are providing your own error handler, you'll typically want to embed the
+jpeg_error_mgr struct in a larger structure; this is discussed later under
+"Error handling".  For now we'll assume you are just using the default error
+handler.  The default error handler will print JPEG error/warning messages
+on stderr, and it will call exit() if a fatal error occurs.
+
+You must initialize the error handler structure, store a pointer to it into
+the JPEG object's "err" field, and then call jpeg_create_compress() to
+initialize the rest of the JPEG object.
+
+Typical code for this step, if you are using the default error handler, is
+
+	struct jpeg_compress_struct cinfo;
+	struct jpeg_error_mgr jerr;
+	...
+	cinfo.err = jpeg_std_error(&jerr);
+	jpeg_create_compress(&cinfo);
+
+jpeg_create_compress allocates a small amount of memory, so it could fail
+if you are out of memory.  In that case it will exit via the error handler;
+that's why the error handler must be initialized first.
+
+
+2. Specify the destination for the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library delivers compressed data to a
+"data destination" module.  The library includes one data destination
+module which knows how to write to a stdio stream.  You can use your own
+destination module if you want to do something else, as discussed later.
+
+If you use the standard destination module, you must open the target stdio
+stream beforehand.  Typical code for this step looks like:
+
+	FILE * outfile;
+	...
+	if ((outfile = fopen(filename, "wb")) == NULL) {
+	    fprintf(stderr, "can't open %s\n", filename);
+	    exit(1);
+	}
+	jpeg_stdio_dest(&cinfo, outfile);
+
+where the last line invokes the standard destination module.
+
+WARNING: it is critical that the binary compressed data be delivered to the
+output file unchanged.  On non-Unix systems the stdio library may perform
+newline translation or otherwise corrupt binary data.  To suppress this
+behavior, you may need to use a "b" option to fopen (as shown above), or use
+setmode() or another routine to put the stdio stream in binary mode.  See
+cjpeg.c and djpeg.c for code that has been found to work on many systems.
+
+You can select the data destination after setting other parameters (step 3),
+if that's more convenient.  You may not change the destination between
+calling jpeg_start_compress() and jpeg_finish_compress().
+
+
+3. Set parameters for compression, including image size & colorspace.
+
+You must supply information about the source image by setting the following
+fields in the JPEG object (cinfo structure):
+
+	image_width		Width of image, in pixels
+	image_height		Height of image, in pixels
+	input_components	Number of color channels (samples per pixel)
+	in_color_space		Color space of source image
+
+The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
+of 1 to 64K pixels in either direction.  The input color space is typically
+RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
+color spaces", later, for more info.)  The in_color_space field must be
+assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
+JCS_GRAYSCALE.
+
+JPEG has a large number of compression parameters that determine how the
+image is encoded.  Most applications don't need or want to know about all
+these parameters.  You can set all the parameters to reasonable defaults by
+calling jpeg_set_defaults(); then, if there are particular values you want
+to change, you can do so after that.  The "Compression parameter selection"
+section tells about all the parameters.
+
+You must set in_color_space correctly before calling jpeg_set_defaults(),
+because the defaults depend on the source image colorspace.  However the
+other three source image parameters need not be valid until you call
+jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
+than once, if that happens to be convenient.
+
+Typical code for a 24-bit RGB source image is
+
+	cinfo.image_width = Width; 	/* image width and height, in pixels */
+	cinfo.image_height = Height;
+	cinfo.input_components = 3;	/* # of color components per pixel */
+	cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
+
+	jpeg_set_defaults(&cinfo);
+	/* Make optional parameter settings here */
+
+
+4. jpeg_start_compress(...);
+
+After you have established the data destination and set all the necessary
+source image info and other parameters, call jpeg_start_compress() to begin
+a compression cycle.  This will initialize internal state, allocate working
+storage, and emit the first few bytes of the JPEG datastream header.
+
+Typical code:
+
+	jpeg_start_compress(&cinfo, TRUE);
+
+The "TRUE" parameter ensures that a complete JPEG interchange datastream
+will be written.  This is appropriate in most cases.  If you think you might
+want to use an abbreviated datastream, read the section on abbreviated
+datastreams, below.
+
+Once you have called jpeg_start_compress(), you may not alter any JPEG
+parameters or other fields of the JPEG object until you have completed
+the compression cycle.
+
+
+5. while (scan lines remain to be written)
+	jpeg_write_scanlines(...);
+
+Now write all the required image data by calling jpeg_write_scanlines()
+one or more times.  You can pass one or more scanlines in each call, up
+to the total image height.  In most applications it is convenient to pass
+just one or a few scanlines at a time.  The expected format for the passed
+data is discussed under "Data formats", above.
+
+Image data should be written in top-to-bottom scanline order.  The JPEG spec
+contains some weasel wording about how top and bottom are application-defined
+terms (a curious interpretation of the English language...) but if you want
+your files to be compatible with everyone else's, you WILL use top-to-bottom
+order.  If the source data must be read in bottom-to-top order, you can use
+the JPEG library's virtual array mechanism to invert the data efficiently.
+Examples of this can be found in the sample application cjpeg.
+
+The library maintains a count of the number of scanlines written so far
+in the next_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.next_scanline < cinfo.image_height)".
+
+Code for this step depends heavily on the way that you store the source data.
+example.c shows the following code for the case of a full-size 2-D source
+array containing 3-byte RGB pixels:
+
+	JSAMPROW row_pointer[1];	/* pointer to a single row */
+	int row_stride;			/* physical row width in buffer */
+
+	row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
+
+	while (cinfo.next_scanline < cinfo.image_height) {
+	    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
+	    jpeg_write_scanlines(&cinfo, row_pointer, 1);
+	}
+
+jpeg_write_scanlines() returns the number of scanlines actually written.
+This will normally be equal to the number passed in, so you can usually
+ignore the return value.  It is different in just two cases:
+  * If you try to write more scanlines than the declared image height,
+    the additional scanlines are ignored.
+  * If you use a suspending data destination manager, output buffer overrun
+    will cause the compressor to return before accepting all the passed lines.
+    This feature is discussed under "I/O suspension", below.  The normal
+    stdio destination manager will NOT cause this to happen.
+In any case, the return value is the same as the change in the value of
+next_scanline.
+
+
+6. jpeg_finish_compress(...);
+
+After all the image data has been written, call jpeg_finish_compress() to
+complete the compression cycle.  This step is ESSENTIAL to ensure that the
+last bufferload of data is written to the data destination.
+jpeg_finish_compress() also releases working memory associated with the JPEG
+object.
+
+Typical code:
+
+	jpeg_finish_compress(&cinfo);
+
+If using the stdio destination manager, don't forget to close the output
+stdio stream (if necessary) afterwards.
+
+If you have requested a multi-pass operating mode, such as Huffman code
+optimization, jpeg_finish_compress() will perform the additional passes using
+data buffered by the first pass.  In this case jpeg_finish_compress() may take
+quite a while to complete.  With the default compression parameters, this will
+not happen.
+
+It is an error to call jpeg_finish_compress() before writing the necessary
+total number of scanlines.  If you wish to abort compression, call
+jpeg_abort() as discussed below.
+
+After completing a compression cycle, you may dispose of the JPEG object
+as discussed next, or you may use it to compress another image.  In that case
+return to step 2, 3, or 4 as appropriate.  If you do not change the
+destination manager, the new datastream will be written to the same target.
+If you do not change any JPEG parameters, the new datastream will be written
+with the same parameters as before.  Note that you can change the input image
+dimensions freely between cycles, but if you change the input colorspace, you
+should call jpeg_set_defaults() to adjust for the new colorspace; and then
+you'll need to repeat all of step 3.
+
+
+7. Release the JPEG compression object.
+
+When you are done with a JPEG compression object, destroy it by calling
+jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
+the previous state of the object).  Or you can call jpeg_destroy(), which
+works for either compression or decompression objects --- this may be more
+convenient if you are sharing code between compression and decompression
+cases.  (Actually, these routines are equivalent except for the declared type
+of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
+should be passed a j_common_ptr.)
+
+If you allocated the jpeg_compress_struct structure from malloc(), freeing
+it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
+handler structure.
+
+Typical code:
+
+	jpeg_destroy_compress(&cinfo);
+
+
+8. Aborting.
+
+If you decide to abort a compression cycle before finishing, you can clean up
+in either of two ways:
+
+* If you don't need the JPEG object any more, just call
+  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
+  legitimate at any point after calling jpeg_create_compress() --- in fact,
+  it's safe even if jpeg_create_compress() fails.
+
+* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
+  jpeg_abort() which works on both compression and decompression objects.
+  This will return the object to an idle state, releasing any working memory.
+  jpeg_abort() is allowed at any time after successful object creation.
+
+Note that cleaning up the data destination, if required, is your
+responsibility; neither of these routines will call term_destination().
+(See "Compressed data handling", below, for more about that.)
+
+jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
+object that has reported an error by calling error_exit (see "Error handling"
+for more info).  The internal state of such an object is likely to be out of
+whack.  Either of these two routines will return the object to a known state.
+
+
+Decompression details
+---------------------
+
+Here we revisit the JPEG decompression outline given in the overview.
+
+1. Allocate and initialize a JPEG decompression object.
+
+This is just like initialization for compression, as discussed above,
+except that the object is a "struct jpeg_decompress_struct" and you
+call jpeg_create_decompress().  Error handling is exactly the same.
+
+Typical code:
+
+	struct jpeg_decompress_struct cinfo;
+	struct jpeg_error_mgr jerr;
+	...
+	cinfo.err = jpeg_std_error(&jerr);
+	jpeg_create_decompress(&cinfo);
+
+(Both here and in the IJG code, we usually use variable name "cinfo" for
+both compression and decompression objects.)
+
+
+2. Specify the source of the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library reads compressed data from a "data
+source" module.  The library includes one data source module which knows how
+to read from a stdio stream.  You can use your own source module if you want
+to do something else, as discussed later.
+
+If you use the standard source module, you must open the source stdio stream
+beforehand.  Typical code for this step looks like:
+
+	FILE * infile;
+	...
+	if ((infile = fopen(filename, "rb")) == NULL) {
+	    fprintf(stderr, "can't open %s\n", filename);
+	    exit(1);
+	}
+	jpeg_stdio_src(&cinfo, infile);
+
+where the last line invokes the standard source module.
+
+WARNING: it is critical that the binary compressed data be read unchanged.
+On non-Unix systems the stdio library may perform newline translation or
+otherwise corrupt binary data.  To suppress this behavior, you may need to use
+a "b" option to fopen (as shown above), or use setmode() or another routine to
+put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
+has been found to work on many systems.
+
+You may not change the data source between calling jpeg_read_header() and
+jpeg_finish_decompress().  If you wish to read a series of JPEG images from
+a single source file, you should repeat the jpeg_read_header() to
+jpeg_finish_decompress() sequence without reinitializing either the JPEG
+object or the data source module; this prevents buffered input data from
+being discarded.
+
+
+3. Call jpeg_read_header() to obtain image info.
+
+Typical code for this step is just
+
+	jpeg_read_header(&cinfo, TRUE);
+
+This will read the source datastream header markers, up to the beginning
+of the compressed data proper.  On return, the image dimensions and other
+info have been stored in the JPEG object.  The application may wish to
+consult this information before selecting decompression parameters.
+
+More complex code is necessary if
+  * A suspending data source is used --- in that case jpeg_read_header()
+    may return before it has read all the header data.  See "I/O suspension",
+    below.  The normal stdio source manager will NOT cause this to happen.
+  * Abbreviated JPEG files are to be processed --- see the section on
+    abbreviated datastreams.  Standard applications that deal only in
+    interchange JPEG files need not be concerned with this case either.
+
+It is permissible to stop at this point if you just wanted to find out the
+image dimensions and other header info for a JPEG file.  In that case,
+call jpeg_destroy() when you are done with the JPEG object, or call
+jpeg_abort() to return it to an idle state before selecting a new data
+source and reading another header.
+
+
+4. Set parameters for decompression.
+
+jpeg_read_header() sets appropriate default decompression parameters based on
+the properties of the image (in particular, its colorspace).  However, you
+may well want to alter these defaults before beginning the decompression.
+For example, the default is to produce full color output from a color file.
+If you want colormapped output you must ask for it.  Other options allow the
+returned image to be scaled and allow various speed/quality tradeoffs to be
+selected.  "Decompression parameter selection", below, gives details.
+
+If the defaults are appropriate, nothing need be done at this step.
+
+Note that all default values are set by each call to jpeg_read_header().
+If you reuse a decompression object, you cannot expect your parameter
+settings to be preserved across cycles, as you can for compression.
+You must set desired parameter values each time.
+
+
+5. jpeg_start_decompress(...);
+
+Once the parameter values are satisfactory, call jpeg_start_decompress() to
+begin decompression.  This will initialize internal state, allocate working
+memory, and prepare for returning data.
+
+Typical code is just
+
+	jpeg_start_decompress(&cinfo);
+
+If you have requested a multi-pass operating mode, such as 2-pass color
+quantization, jpeg_start_decompress() will do everything needed before data
+output can begin.  In this case jpeg_start_decompress() may take quite a while
+to complete.  With a single-scan (non-progressive) JPEG file and default
+decompression parameters, this will not happen; jpeg_start_decompress() will
+return quickly.
+
+After this call, the final output image dimensions, including any requested
+scaling, are available in the JPEG object; so is the selected colormap, if
+colormapped output has been requested.  Useful fields include
+
+	output_width		image width and height, as scaled
+	output_height
+	out_color_components	# of color components in out_color_space
+	output_components	# of color components returned per pixel
+	colormap		the selected colormap, if any
+	actual_number_of_colors		number of entries in colormap
+
+output_components is 1 (a colormap index) when quantizing colors; otherwise it
+equals out_color_components.  It is the number of JSAMPLE values that will be
+emitted per pixel in the output arrays.
+
+Typically you will need to allocate data buffers to hold the incoming image.
+You will need output_width * output_components JSAMPLEs per scanline in your
+output buffer, and a total of output_height scanlines will be returned.
+
+Note: if you are using the JPEG library's internal memory manager to allocate
+data buffers (as djpeg does), then the manager's protocol requires that you
+request large buffers *before* calling jpeg_start_decompress().  This is a
+little tricky since the output_XXX fields are not normally valid then.  You
+can make them valid by calling jpeg_calc_output_dimensions() after setting the
+relevant parameters (scaling, output color space, and quantization flag).
+
+
+6. while (scan lines remain to be read)
+	jpeg_read_scanlines(...);
+
+Now you can read the decompressed image data by calling jpeg_read_scanlines()
+one or more times.  At each call, you pass in the maximum number of scanlines
+to be read (ie, the height of your working buffer); jpeg_read_scanlines()
+will return up to that many lines.  The return value is the number of lines
+actually read.  The format of the returned data is discussed under "Data
+formats", above.  Don't forget that grayscale and color JPEGs will return
+different data formats!
+
+Image data is returned in top-to-bottom scanline order.  If you must write
+out the image in bottom-to-top order, you can use the JPEG library's virtual
+array mechanism to invert the data efficiently.  Examples of this can be
+found in the sample application djpeg.
+
+The library maintains a count of the number of scanlines returned so far
+in the output_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
+should NOT be against image_height, unless you never use scaling.  The
+image_height field is the height of the original unscaled image.)
+The return value always equals the change in the value of output_scanline.
+
+If you don't use a suspending data source, it is safe to assume that
+jpeg_read_scanlines() reads at least one scanline per call, until the
+bottom of the image has been reached.
+
+If you use a buffer larger than one scanline, it is NOT safe to assume that
+jpeg_read_scanlines() fills it.  (The current implementation returns only a
+few scanlines per call, no matter how large a buffer you pass.)  So you must
+always provide a loop that calls jpeg_read_scanlines() repeatedly until the
+whole image has been read.
+
+
+7. jpeg_finish_decompress(...);
+
+After all the image data has been read, call jpeg_finish_decompress() to
+complete the decompression cycle.  This causes working memory associated
+with the JPEG object to be released.
+
+Typical code:
+
+	jpeg_finish_decompress(&cinfo);
+
+If using the stdio source manager, don't forget to close the source stdio
+stream if necessary.
+
+It is an error to call jpeg_finish_decompress() before reading the correct
+total number of scanlines.  If you wish to abort decompression, call
+jpeg_abort() as discussed below.
+
+After completing a decompression cycle, you may dispose of the JPEG object as
+discussed next, or you may use it to decompress another image.  In that case
+return to step 2 or 3 as appropriate.  If you do not change the source
+manager, the next image will be read from the same source.
+
+
+8. Release the JPEG decompression object.
+
+When you are done with a JPEG decompression object, destroy it by calling
+jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
+destroying compression objects applies here too.
+
+Typical code:
+
+	jpeg_destroy_decompress(&cinfo);
+
+
+9. Aborting.
+
+You can abort a decompression cycle by calling jpeg_destroy_decompress() or
+jpeg_destroy() if you don't need the JPEG object any more, or
+jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
+The previous discussion of aborting compression cycles applies here too.
+
+
+Mechanics of usage: include files, linking, etc
+-----------------------------------------------
+
+Applications using the JPEG library should include the header file jpeglib.h
+to obtain declarations of data types and routines.  Before including
+jpeglib.h, include system headers that define at least the typedefs FILE and
+size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
+older Unix systems, you may need <sys/types.h> to define size_t.
+
+If the application needs to refer to individual JPEG library error codes, also
+include jerror.h to define those symbols.
+
+jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
+installing the JPEG header files in a system directory, you will want to
+install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
+
+The most convenient way to include the JPEG code into your executable program
+is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
+machines) and reference it at your link step.  If you use only half of the
+library (only compression or only decompression), only that much code will be
+included from the library, unless your linker is hopelessly brain-damaged.
+The supplied makefiles build libjpeg.a automatically (see install.txt).
+
+While you can build the JPEG library as a shared library if the whim strikes
+you, we don't really recommend it.  The trouble with shared libraries is that
+at some point you'll probably try to substitute a new version of the library
+without recompiling the calling applications.  That generally doesn't work
+because the parameter struct declarations usually change with each new
+version.  In other words, the library's API is *not* guaranteed binary
+compatible across versions; we only try to ensure source-code compatibility.
+(In hindsight, it might have been smarter to hide the parameter structs from
+applications and introduce a ton of access functions instead.  Too late now,
+however.)
+
+On some systems your application may need to set up a signal handler to ensure
+that temporary files are deleted if the program is interrupted.  This is most
+critical if you are on MS-DOS and use the jmemdos.c memory manager back end;
+it will try to grab extended memory for temp files, and that space will NOT be
+freed automatically.  See cjpeg.c or djpeg.c for an example signal handler.
+
+It may be worth pointing out that the core JPEG library does not actually
+require the stdio library: only the default source/destination managers and
+error handler need it.  You can use the library in a stdio-less environment
+if you replace those modules and use jmemnobs.c (or another memory manager of
+your own devising).  More info about the minimum system library requirements
+may be found in jinclude.h.
+
+
+ADVANCED FEATURES
+=================
+
+Compression parameter selection
+-------------------------------
+
+This section describes all the optional parameters you can set for JPEG
+compression, as well as the "helper" routines provided to assist in this
+task.  Proper setting of some parameters requires detailed understanding
+of the JPEG standard; if you don't know what a parameter is for, it's best
+not to mess with it!  See REFERENCES in the README file for pointers to
+more info about JPEG.
+
+It's a good idea to call jpeg_set_defaults() first, even if you plan to set
+all the parameters; that way your code is more likely to work with future JPEG
+libraries that have additional parameters.  For the same reason, we recommend
+you use a helper routine where one is provided, in preference to twiddling
+cinfo fields directly.
+
+The helper routines are:
+
+jpeg_set_defaults (j_compress_ptr cinfo)
+	This routine sets all JPEG parameters to reasonable defaults, using
+	only the input image's color space (field in_color_space, which must
+	already be set in cinfo).  Many applications will only need to use
+	this routine and perhaps jpeg_set_quality().
+
+jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+	Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
+	and sets other color-space-dependent parameters appropriately.  See
+	"Special color spaces", below, before using this.  A large number of
+	parameters, including all per-component parameters, are set by this
+	routine; if you want to twiddle individual parameters you should call
+	jpeg_set_colorspace() before rather than after.
+
+jpeg_default_colorspace (j_compress_ptr cinfo)
+	Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
+	and calls jpeg_set_colorspace().  This is actually a subroutine of
+	jpeg_set_defaults().  It's broken out in case you want to change
+	just the colorspace-dependent JPEG parameters.
+
+jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+	Constructs JPEG quantization tables appropriate for the indicated
+	quality setting.  The quality value is expressed on the 0..100 scale
+	recommended by IJG (cjpeg's "-quality" switch uses this routine).
+	Note that the exact mapping from quality values to tables may change
+	in future IJG releases as more is learned about DCT quantization.
+	If the force_baseline parameter is TRUE, then the quantization table
+	entries are constrained to the range 1..255 for full JPEG baseline
+	compatibility.  In the current implementation, this only makes a
+	difference for quality settings below 25, and it effectively prevents
+	very small/low quality files from being generated.  The IJG decoder
+	is capable of reading the non-baseline files generated at low quality
+	settings when force_baseline is FALSE, but other decoders may not be.
+
+jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+			 boolean force_baseline)
+	Same as jpeg_set_quality() except that the generated tables are the
+	sample tables given in the JPEG spec section K.1, multiplied by the
+	specified scale factor (which is expressed as a percentage; thus
+	scale_factor = 100 reproduces the spec's tables).  Note that larger
+	scale factors give lower quality.  This entry point is useful for
+	conforming to the Adobe PostScript DCT conventions, but we do not
+	recommend linear scaling as a user-visible quality scale otherwise.
+	force_baseline again constrains the computed table entries to 1..255.
+
+int jpeg_quality_scaling (int quality)
+	Converts a value on the IJG-recommended quality scale to a linear
+	scaling percentage.  Note that this routine may change or go away
+	in future releases --- IJG may choose to adopt a scaling method that
+	can't be expressed as a simple scalar multiplier, in which case the
+	premise of this routine collapses.  Caveat user.
+
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+	Set default quantization tables with linear q_scale_factor[] values
+	(see below).
+
+jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
+		      const unsigned int *basic_table,
+		      int scale_factor, boolean force_baseline)
+	Allows an arbitrary quantization table to be created.  which_tbl
+	indicates which table slot to fill.  basic_table points to an array
+	of 64 unsigned ints given in normal array order.  These values are
+	multiplied by scale_factor/100 and then clamped to the range 1..65535
+	(or to 1..255 if force_baseline is TRUE).
+	CAUTION: prior to library version 6a, jpeg_add_quant_table expected
+	the basic table to be given in JPEG zigzag order.  If you need to
+	write code that works with either older or newer versions of this
+	routine, you must check the library version number.  Something like
+	"#if JPEG_LIB_VERSION >= 61" is the right test.
+
+jpeg_simple_progression (j_compress_ptr cinfo)
+	Generates a default scan script for writing a progressive-JPEG file.
+	This is the recommended method of creating a progressive file,
+	unless you want to make a custom scan sequence.  You must ensure that
+	the JPEG color space is set correctly before calling this routine.
+
+
+Compression parameters (cinfo fields) include:
+
+J_DCT_METHOD dct_method
+	Selects the algorithm used for the DCT step.  Choices are:
+		JDCT_ISLOW: slow but accurate integer algorithm
+		JDCT_IFAST: faster, less accurate integer method
+		JDCT_FLOAT: floating-point method
+		JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+		JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+	The FLOAT method is very slightly more accurate than the ISLOW method,
+	but may give different results on different machines due to varying
+	roundoff behavior.  The integer methods should give the same results
+	on all machines.  On machines with sufficiently fast FP hardware, the
+	floating-point method may also be the fastest.  The IFAST method is
+	considerably less accurate than the other two; its use is not
+	recommended if high quality is a concern.  JDCT_DEFAULT and
+	JDCT_FASTEST are macros configurable by each installation.
+
+unsigned int scale_num, scale_denom
+	Scale the image by the fraction scale_num/scale_denom.  Default is
+	1/1, or no scaling.  Currently, the supported scaling ratios are
+	8/N with all N from 1 to 16.  (The library design allows for arbitrary
+	scaling ratios but this is not likely to be implemented any time soon.)
+
+J_COLOR_SPACE jpeg_color_space
+int num_components
+	The JPEG color space and corresponding number of components; see
+	"Special color spaces", below, for more info.  We recommend using
+	jpeg_set_colorspace() if you want to change these.
+
+boolean optimize_coding
+	TRUE causes the compressor to compute optimal Huffman coding tables
+	for the image.  This requires an extra pass over the data and
+	therefore costs a good deal of space and time.  The default is
+	FALSE, which tells the compressor to use the supplied or default
+	Huffman tables.  In most cases optimal tables save only a few percent
+	of file size compared to the default tables.  Note that when this is
+	TRUE, you need not supply Huffman tables at all, and any you do
+	supply will be overwritten.
+
+unsigned int restart_interval
+int restart_in_rows
+	To emit restart markers in the JPEG file, set one of these nonzero.
+	Set restart_interval to specify the exact interval in MCU blocks.
+	Set restart_in_rows to specify the interval in MCU rows.  (If
+	restart_in_rows is not 0, then restart_interval is set after the
+	image width in MCUs is computed.)  Defaults are zero (no restarts).
+	One restart marker per MCU row is often a good choice.
+	NOTE: the overhead of restart markers is higher in grayscale JPEG
+	files than in color files, and MUCH higher in progressive JPEGs.
+	If you use restarts, you may want to use larger intervals in those
+	cases.
+
+const jpeg_scan_info * scan_info
+int num_scans
+	By default, scan_info is NULL; this causes the compressor to write a
+	single-scan sequential JPEG file.  If not NULL, scan_info points to
+	an array of scan definition records of length num_scans.  The
+	compressor will then write a JPEG file having one scan for each scan
+	definition record.  This is used to generate noninterleaved or
+	progressive JPEG files.  The library checks that the scan array
+	defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
+	a suitable scan definition array for progressive JPEG.)  This is
+	discussed further under "Progressive JPEG support".
+
+boolean do_fancy_downsampling
+	If TRUE, use direct DCT scaling with DCT size > 8 for downsampling
+	of chroma components.
+	If FALSE, use only DCT size <= 8 and simple separate downsampling.
+	Default is TRUE.
+	For better image stability in multiple generation compression cycles
+	it is preferable that this value matches the corresponding
+	do_fancy_upsampling value in decompression.
+
+int smoothing_factor
+	If non-zero, the input image is smoothed; the value should range from
+	1 for minimal smoothing to 100 for maximum smoothing.  Consult jcsample.c
+	for details of the smoothing algorithm.  The default is zero.
+
+boolean write_JFIF_header
+	If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
+	jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
+	(i.e., YCbCr or grayscale) is selected, otherwise FALSE.
+
+UINT8 JFIF_major_version
+UINT8 JFIF_minor_version
+	The version number to be written into the JFIF marker.
+	jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
+	You should set it to 1.02 (major=1, minor=2) if you plan to write
+	any JFIF 1.02 extension markers.
+
+UINT8 density_unit
+UINT16 X_density
+UINT16 Y_density
+	The resolution information to be written into the JFIF marker;
+	not used otherwise.  density_unit may be 0 for unknown,
+	1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
+	indicating square pixels of unknown size.
+
+boolean write_Adobe_marker
+	If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
+	jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
+	or YCCK is selected, otherwise FALSE.  It is generally a bad idea
+	to set both write_JFIF_header and write_Adobe_marker.  In fact,
+	you probably shouldn't change the default settings at all --- the
+	default behavior ensures that the JPEG file's color space can be
+	recognized by the decoder.
+
+JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]
+	Pointers to coefficient quantization tables, one per table slot,
+	or NULL if no table is defined for a slot.  Usually these should
+	be set via one of the above helper routines; jpeg_add_quant_table()
+	is general enough to define any quantization table.  The other
+	routines will set up table slot 0 for luminance quality and table
+	slot 1 for chrominance.
+
+int q_scale_factor[NUM_QUANT_TBLS]
+	Linear quantization scaling factors (percentage, initialized 100)
+	for use with jpeg_default_qtables().
+	See rdswitch.c and cjpeg.c for an example of usage.
+	Note that the q_scale_factor[] fields are the "linear" scales, so you
+	have to convert from user-defined ratings via jpeg_quality_scaling().
+	Here is example code that corresponds to cjpeg -quality 90,70:
+
+		jpeg_set_defaults(cinfo);
+
+		/* Set luminance quality 90. */
+		cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);
+		/* Set chrominance quality 70. */
+		cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);
+
+		jpeg_default_qtables(cinfo, force_baseline);
+
+	CAUTION: You must also set 1x1 subsampling for efficient separate
+	color quality selection, since the default value used by the library
+	is 2x2:
+
+		cinfo->comp_info[0].v_samp_factor = 1;
+		cinfo->comp_info[0].h_samp_factor = 1;
+
+JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
+	Pointers to Huffman coding tables, one per table slot, or NULL if
+	no table is defined for a slot.  Slots 0 and 1 are filled with the
+	JPEG sample tables by jpeg_set_defaults().  If you need to allocate
+	more table structures, jpeg_alloc_huff_table() may be used.
+	Note that optimal Huffman tables can be computed for an image
+	by setting optimize_coding, as discussed above; there's seldom
+	any need to mess with providing your own Huffman tables.
+
+
+The actual dimensions of the JPEG image that will be written to the file are
+given by the following fields.  These are computed from the input image
+dimensions and the compression parameters by jpeg_start_compress().  You can
+also call jpeg_calc_jpeg_dimensions() to obtain the values that will result
+from the current parameter settings.  This can be useful if you are trying
+to pick a scaling ratio that will get close to a desired target size.
+
+JDIMENSION jpeg_width		Actual dimensions of output image.
+JDIMENSION jpeg_height
+
+
+Per-component parameters are stored in the struct cinfo.comp_info[i] for
+component number i.  Note that components here refer to components of the
+JPEG color space, *not* the source image color space.  A suitably large
+comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
+to use that routine, it's up to you to allocate the array.
+
+int component_id
+	The one-byte identifier code to be recorded in the JPEG file for
+	this component.  For the standard color spaces, we recommend you
+	leave the default values alone.
+
+int h_samp_factor
+int v_samp_factor
+	Horizontal and vertical sampling factors for the component; must
+	be 1..4 according to the JPEG standard.  Note that larger sampling
+	factors indicate a higher-resolution component; many people find
+	this behavior quite unintuitive.  The default values are 2,2 for
+	luminance components and 1,1 for chrominance components, except
+	for grayscale where 1,1 is used.
+
+int quant_tbl_no
+	Quantization table number for component.  The default value is
+	0 for luminance components and 1 for chrominance components.
+
+int dc_tbl_no
+int ac_tbl_no
+	DC and AC entropy coding table numbers.  The default values are
+	0 for luminance components and 1 for chrominance components.
+
+int component_index
+	Must equal the component's index in comp_info[].  (Beginning in
+	release v6, the compressor library will fill this in automatically;
+	you don't have to.)
+
+
+Decompression parameter selection
+---------------------------------
+
+Decompression parameter selection is somewhat simpler than compression
+parameter selection, since all of the JPEG internal parameters are
+recorded in the source file and need not be supplied by the application.
+(Unless you are working with abbreviated files, in which case see
+"Abbreviated datastreams", below.)  Decompression parameters control
+the postprocessing done on the image to deliver it in a format suitable
+for the application's use.  Many of the parameters control speed/quality
+tradeoffs, in which faster decompression may be obtained at the price of
+a poorer-quality image.  The defaults select the highest quality (slowest)
+processing.
+
+The following fields in the JPEG object are set by jpeg_read_header() and
+may be useful to the application in choosing decompression parameters:
+
+JDIMENSION image_width			Width and height of image
+JDIMENSION image_height
+int num_components			Number of color components
+J_COLOR_SPACE jpeg_color_space		Colorspace of image
+boolean saw_JFIF_marker			TRUE if a JFIF APP0 marker was seen
+  UINT8 JFIF_major_version		Version information from JFIF marker
+  UINT8 JFIF_minor_version
+  UINT8 density_unit			Resolution data from JFIF marker
+  UINT16 X_density
+  UINT16 Y_density
+boolean saw_Adobe_marker		TRUE if an Adobe APP14 marker was seen
+  UINT8 Adobe_transform			Color transform code from Adobe marker
+
+The JPEG color space, unfortunately, is something of a guess since the JPEG
+standard proper does not provide a way to record it.  In practice most files
+adhere to the JFIF or Adobe conventions, and the decoder will recognize these
+correctly.  See "Special color spaces", below, for more info.
+
+
+The decompression parameters that determine the basic properties of the
+returned image are:
+
+J_COLOR_SPACE out_color_space
+	Output color space.  jpeg_read_header() sets an appropriate default
+	based on jpeg_color_space; typically it will be RGB or grayscale.
+	The application can change this field to request output in a different
+	colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
+	output from a color file.  (This is useful for previewing: grayscale
+	output is faster than full color since the color components need not
+	be processed.)  Note that not all possible color space transforms are
+	currently implemented; you may need to extend jdcolor.c if you want an
+	unusual conversion.
+
+unsigned int scale_num, scale_denom
+	Scale the image by the fraction scale_num/scale_denom.  Currently,
+	the supported scaling ratios are M/N for any integer M from 1 to 16, where
+	N is the source DCT size, which is 8 for baseline JPEG.  (The library
+	design allows for arbitrary scaling ratios but this is not likely
+	to be implemented any time soon.)  The values are initialized by
+	jpeg_read_header() with the source DCT size.  For baseline JPEG
+	this is 8/8.  If you change only the scale_num value while leaving
+	scale_denom unchanged, then scale_num specifies the scaled DCT size
+	to be applied to the given input.  For baseline JPEG this is equivalent
+	to M/8 scaling, since the source DCT size for baseline JPEG is 8.
+	Smaller scaling ratios permit significantly faster decoding since
+	fewer pixels need be processed and a simpler IDCT method can be used.
+
+boolean quantize_colors
+	If set TRUE, colormapped output will be delivered.  Default is FALSE,
+	meaning that full-color output will be delivered.
+
+The next three parameters are relevant only if quantize_colors is TRUE.
+
+int desired_number_of_colors
+	Maximum number of colors to use in generating a library-supplied color
+	map (the actual number of colors is returned in a different field).
+	Default 256.  Ignored when the application supplies its own color map.
+
+boolean two_pass_quantize
+	If TRUE, an extra pass over the image is made to select a custom color
+	map for the image.  This usually looks a lot better than the one-size-
+	fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
+	when the application supplies its own color map.
+
+J_DITHER_MODE dither_mode
+	Selects color dithering method.  Supported values are:
+		JDITHER_NONE	no dithering: fast, very low quality
+		JDITHER_ORDERED	ordered dither: moderate speed and quality
+		JDITHER_FS	Floyd-Steinberg dither: slow, high quality
+	Default is JDITHER_FS.  (At present, ordered dither is implemented
+	only in the single-pass, standard-colormap case.  If you ask for
+	ordered dither when two_pass_quantize is TRUE or when you supply
+	an external color map, you'll get F-S dithering.)
+
+When quantize_colors is TRUE, the target color map is described by the next
+two fields.  colormap is set to NULL by jpeg_read_header().  The application
+can supply a color map by setting colormap non-NULL and setting
+actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
+selects a suitable color map and sets these two fields itself.
+[Implementation restriction: at present, an externally supplied colormap is
+only accepted for 3-component output color spaces.]
+
+JSAMPARRAY colormap
+	The color map, represented as a 2-D pixel array of out_color_components
+	rows and actual_number_of_colors columns.  Ignored if not quantizing.
+	CAUTION: if the JPEG library creates its own colormap, the storage
+	pointed to by this field is released by jpeg_finish_decompress().
+	Copy the colormap somewhere else first, if you want to save it.
+
+int actual_number_of_colors
+	The number of colors in the color map.
+
+Additional decompression parameters that the application may set include:
+
+J_DCT_METHOD dct_method
+	Selects the algorithm used for the DCT step.  Choices are the same
+	as described above for compression.
+
+boolean do_fancy_upsampling
+	If TRUE, use direct DCT scaling with DCT size > 8 for upsampling
+	of chroma components.
+	If FALSE, use only DCT size <= 8 and simple separate upsampling.
+	Default is TRUE.
+	For better image stability in multiple generation compression cycles
+	it is preferable that this value matches the corresponding
+	do_fancy_downsampling value in compression.
+
+boolean do_block_smoothing
+	If TRUE, interblock smoothing is applied in early stages of decoding
+	progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
+	progression stages look "fuzzy" with smoothing, "blocky" without.
+	In any case, block smoothing ceases to be applied after the first few
+	AC coefficients are known to full accuracy, so it is relevant only
+	when using buffered-image mode for progressive images.
+
+boolean enable_1pass_quant
+boolean enable_external_quant
+boolean enable_2pass_quant
+	These are significant only in buffered-image mode, which is
+	described in its own section below.
+
+
+The output image dimensions are given by the following fields.  These are
+computed from the source image dimensions and the decompression parameters
+by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
+to obtain the values that will result from the current parameter settings.
+This can be useful if you are trying to pick a scaling ratio that will get
+close to a desired target size.  It's also important if you are using the
+JPEG library's memory manager to allocate output buffer space, because you
+are supposed to request such buffers *before* jpeg_start_decompress().
+
+JDIMENSION output_width		Actual dimensions of output image.
+JDIMENSION output_height
+int out_color_components	Number of color components in out_color_space.
+int output_components		Number of color components returned.
+int rec_outbuf_height		Recommended height of scanline buffer.
+
+When quantizing colors, output_components is 1, indicating a single color map
+index per pixel.  Otherwise it equals out_color_components.  The output arrays
+are required to be output_width * output_components JSAMPLEs wide.
+
+rec_outbuf_height is the recommended minimum height (in scanlines) of the
+buffer passed to jpeg_read_scanlines().  If the buffer is smaller, the
+library will still work, but time will be wasted due to unnecessary data
+copying.  In high-quality modes, rec_outbuf_height is always 1, but some
+faster, lower-quality modes set it to larger values (typically 2 to 4).
+If you are going to ask for a high-speed processing mode, you may as well
+go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
+(An output buffer larger than rec_outbuf_height lines is OK, but won't
+provide any material speed improvement over that height.)
+
+
+Special color spaces
+--------------------
+
+The JPEG standard itself is "color blind" and doesn't specify any particular
+color space.  It is customary to convert color data to a luminance/chrominance
+color space before compressing, since this permits greater compression.  The
+existing de-facto JPEG file format standards specify YCbCr or grayscale data
+(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
+applications such as multispectral images, other color spaces can be used,
+but it must be understood that such files will be unportable.
+
+The JPEG library can handle the most common colorspace conversions (namely
+RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
+color space, passing it through without conversion.  If you deal extensively
+with an unusual color space, you can easily extend the library to understand
+additional color spaces and perform appropriate conversions.
+
+For compression, the source data's color space is specified by field
+in_color_space.  This is transformed to the JPEG file's color space given
+by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
+space depending on in_color_space, but you can override this by calling
+jpeg_set_colorspace().  Of course you must select a supported transformation.
+jccolor.c currently supports the following transformations:
+	RGB => YCbCr
+	RGB => GRAYSCALE
+	YCbCr => GRAYSCALE
+	CMYK => YCCK
+plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
+YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
+
+The de-facto file format standards (JFIF and Adobe) specify APPn markers that
+indicate the color space of the JPEG file.  It is important to ensure that
+these are written correctly, or omitted if the JPEG file's color space is not
+one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
+will set the compression parameters to include or omit the APPn markers
+properly, so long as it is told the truth about the JPEG color space.
+For example, if you are writing some random 3-component color space without
+conversion, don't try to fake out the library by setting in_color_space and
+jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
+APPn marker of your own devising to identify the colorspace --- see "Special
+markers", below.
+
+When told that the color space is UNKNOWN, the library will default to using
+luminance-quality compression parameters for all color components.  You may
+well want to change these parameters.  See the source code for
+jpeg_set_colorspace(), in jcparam.c, for details.
+
+For decompression, the JPEG file's color space is given in jpeg_color_space,
+and this is transformed to the output color space out_color_space.
+jpeg_read_header's setting of jpeg_color_space can be relied on if the file
+conforms to JFIF or Adobe conventions, but otherwise it is no better than a
+guess.  If you know the JPEG file's color space for certain, you can override
+jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
+selects a default output color space based on (its guess of) jpeg_color_space;
+set out_color_space to override this.  Again, you must select a supported
+transformation.  jdcolor.c currently supports
+	YCbCr => GRAYSCALE
+	YCbCr => RGB
+	GRAYSCALE => RGB
+	YCCK => CMYK
+as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
+application can force grayscale JPEGs to look like color JPEGs if it only
+wants to handle one case.)
+
+The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
+(it weights distances appropriately for RGB colors).  You'll need to modify
+the code if you want to use it for non-RGB output color spaces.  Note that
+jquant2.c is used to map to an application-supplied colormap as well as for
+the normal two-pass colormap selection process.
+
+CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
+files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
+This is arguably a bug in Photoshop, but if you need to work with Photoshop
+CMYK files, you will have to deal with it in your application.  We cannot
+"fix" this in the library by inverting the data during the CMYK<=>YCCK
+transform, because that would break other applications, notably Ghostscript.
+Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
+data in the same inverted-YCCK representation used in bare JPEG files, but
+the surrounding PostScript code performs an inversion using the PS image
+operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
+EPS/JPEG files, and will omit the PS-level inversion.  (But the data
+polarity used in bare JPEG files will not change in 3.0.)  In either case,
+the JPEG library must not invert the data itself, or else Ghostscript would
+read these EPS files incorrectly.
+
+
+Error handling
+--------------
+
+When the default error handler is used, any error detected inside the JPEG
+routines will cause a message to be printed on stderr, followed by exit().
+You can supply your own error handling routines to override this behavior
+and to control the treatment of nonfatal warnings and trace/debug messages.
+The file example.c illustrates the most common case, which is to have the
+application regain control after an error rather than exiting.
+
+The JPEG library never writes any message directly; it always goes through
+the error handling routines.  Three classes of messages are recognized:
+  * Fatal errors: the library cannot continue.
+  * Warnings: the library can continue, but the data is corrupt, and a
+    damaged output image is likely to result.
+  * Trace/informational messages.  These come with a trace level indicating
+    the importance of the message; you can control the verbosity of the
+    program by adjusting the maximum trace level that will be displayed.
+
+You may, if you wish, simply replace the entire JPEG error handling module
+(jerror.c) with your own code.  However, you can avoid code duplication by
+only replacing some of the routines depending on the behavior you need.
+This is accomplished by calling jpeg_std_error() as usual, but then overriding
+some of the method pointers in the jpeg_error_mgr struct, as illustrated by
+example.c.
+
+All of the error handling routines will receive a pointer to the JPEG object
+(a j_common_ptr which points to either a jpeg_compress_struct or a
+jpeg_decompress_struct; if you need to tell which, test the is_decompressor
+field).  This struct includes a pointer to the error manager struct in its
+"err" field.  Frequently, custom error handler routines will need to access
+additional data which is not known to the JPEG library or the standard error
+handler.  The most convenient way to do this is to embed either the JPEG
+object or the jpeg_error_mgr struct in a larger structure that contains
+additional fields; then casting the passed pointer provides access to the
+additional fields.  Again, see example.c for one way to do it.  (Beginning
+with IJG version 6b, there is also a void pointer "client_data" in each
+JPEG object, which the application can also use to find related data.
+The library does not touch client_data at all.)
+
+The individual methods that you might wish to override are:
+
+error_exit (j_common_ptr cinfo)
+	Receives control for a fatal error.  Information sufficient to
+	generate the error message has been stored in cinfo->err; call
+	output_message to display it.  Control must NOT return to the caller;
+	generally this routine will exit() or longjmp() somewhere.
+	Typically you would override this routine to get rid of the exit()
+	default behavior.  Note that if you continue processing, you should
+	clean up the JPEG object with jpeg_abort() or jpeg_destroy().
+
+output_message (j_common_ptr cinfo)
+	Actual output of any JPEG message.  Override this to send messages
+	somewhere other than stderr.  Note that this method does not know
+	how to generate a message, only where to send it.
+
+format_message (j_common_ptr cinfo, char * buffer)
+	Constructs a readable error message string based on the error info
+	stored in cinfo->err.  This method is called by output_message.  Few
+	applications should need to override this method.  One possible
+	reason for doing so is to implement dynamic switching of error message
+	language.
+
+emit_message (j_common_ptr cinfo, int msg_level)
+	Decides whether or not to emit a warning or trace message; if so,
+	calls output_message.  The main reason for overriding this method
+	would be to abort on warnings.  msg_level is -1 for warnings,
+	0 and up for trace messages.
+
+Only error_exit() and emit_message() are called from the rest of the JPEG
+library; the other two are internal to the error handler.
+
+The actual message texts are stored in an array of strings which is pointed to
+by the field err->jpeg_message_table.  The messages are numbered from 0 to
+err->last_jpeg_message, and it is these code numbers that are used in the
+JPEG library code.  You could replace the message texts (for instance, with
+messages in French or German) by changing the message table pointer.  See
+jerror.h for the default texts.  CAUTION: this table will almost certainly
+change or grow from one library version to the next.
+
+It may be useful for an application to add its own message texts that are
+handled by the same mechanism.  The error handler supports a second "add-on"
+message table for this purpose.  To define an addon table, set the pointer
+err->addon_message_table and the message numbers err->first_addon_message and
+err->last_addon_message.  If you number the addon messages beginning at 1000
+or so, you won't have to worry about conflicts with the library's built-in
+messages.  See the sample applications cjpeg/djpeg for an example of using
+addon messages (the addon messages are defined in cderror.h).
+
+Actual invocation of the error handler is done via macros defined in jerror.h:
+	ERREXITn(...)	for fatal errors
+	WARNMSn(...)	for corrupt-data warnings
+	TRACEMSn(...)	for trace and informational messages.
+These macros store the message code and any additional parameters into the
+error handler struct, then invoke the error_exit() or emit_message() method.
+The variants of each macro are for varying numbers of additional parameters.
+The additional parameters are inserted into the generated message using
+standard printf() format codes.
+
+See jerror.h and jerror.c for further details.
+
+
+Compressed data handling (source and destination managers)
+----------------------------------------------------------
+
+The JPEG compression library sends its compressed data to a "destination
+manager" module.  The default destination manager just writes the data to a
+memory buffer or to a stdio stream, but you can provide your own manager to
+do something else.  Similarly, the decompression library calls a "source
+manager" to obtain the compressed data; you can provide your own source
+manager if you want the data to come from somewhere other than a memory
+buffer or a stdio stream.
+
+In both cases, compressed data is processed a bufferload at a time: the
+destination or source manager provides a work buffer, and the library invokes
+the manager only when the buffer is filled or emptied.  (You could define a
+one-character buffer to force the manager to be invoked for each byte, but
+that would be rather inefficient.)  The buffer's size and location are
+controlled by the manager, not by the library.  For example, the memory
+source manager just makes the buffer pointer and length point to the original
+data in memory.  In this case the buffer-reload procedure will be invoked
+only if the decompressor ran off the end of the datastream, which would
+indicate an erroneous datastream.
+
+The work buffer is defined as an array of datatype JOCTET, which is generally
+"char" or "unsigned char".  On a machine where char is not exactly 8 bits
+wide, you must define JOCTET as a wider data type and then modify the data
+source and destination modules to transcribe the work arrays into 8-bit units
+on external storage.
+
+A data destination manager struct contains a pointer and count defining the
+next byte to write in the work buffer and the remaining free space:
+
+	JOCTET * next_output_byte;  /* => next byte to write in buffer */
+	size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is filled.  The manager's empty_output_buffer method must reset the pointer
+and count.  The manager is expected to remember the buffer's starting address
+and total size in private fields not visible to the library.
+
+A data destination manager provides three methods:
+
+init_destination (j_compress_ptr cinfo)
+	Initialize destination.  This is called by jpeg_start_compress()
+	before any data is actually written.  It must initialize
+	next_output_byte and free_in_buffer.  free_in_buffer must be
+	initialized to a positive value.
+
+empty_output_buffer (j_compress_ptr cinfo)
+	This is called whenever the buffer has filled (free_in_buffer
+	reaches zero).  In typical applications, it should write out the
+	*entire* buffer (use the saved start address and buffer length;
+	ignore the current state of next_output_byte and free_in_buffer).
+	Then reset the pointer & count to the start of the buffer, and
+	return TRUE indicating that the buffer has been dumped.
+	free_in_buffer must be set to a positive value when TRUE is
+	returned.  A FALSE return should only be used when I/O suspension is
+	desired (this operating mode is discussed in the next section).
+
+term_destination (j_compress_ptr cinfo)
+	Terminate destination --- called by jpeg_finish_compress() after all
+	data has been written.  In most applications, this must flush any
+	data remaining in the buffer.  Use either next_output_byte or
+	free_in_buffer to determine how much data is in the buffer.
+
+term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
+want the destination manager to be cleaned up during an abort, you must do it
+yourself.
+
+You will also need code to create a jpeg_destination_mgr struct, fill in its
+method pointers, and insert a pointer to the struct into the "dest" field of
+the JPEG compression object.  This can be done in-line in your setup code if
+you like, but it's probably cleaner to provide a separate routine similar to
+the jpeg_stdio_dest() or jpeg_mem_dest() routines of the supplied destination
+managers.
+
+Decompression source managers follow a parallel design, but with some
+additional features.  The source manager struct contains a pointer and count
+defining the next byte to read from the work buffer and the number of bytes
+remaining:
+
+	const JOCTET * next_input_byte; /* => next byte to read from buffer */
+	size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is emptied.  The manager's fill_input_buffer method must reset the pointer and
+count.  In most applications, the manager must remember the buffer's starting
+address and total size in private fields not visible to the library.
+
+A data source manager provides five methods:
+
+init_source (j_decompress_ptr cinfo)
+	Initialize source.  This is called by jpeg_read_header() before any
+	data is actually read.  Unlike init_destination(), it may leave
+	bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
+	will occur immediately).
+
+fill_input_buffer (j_decompress_ptr cinfo)
+	This is called whenever bytes_in_buffer has reached zero and more
+	data is wanted.  In typical applications, it should read fresh data
+	into the buffer (ignoring the current state of next_input_byte and
+	bytes_in_buffer), reset the pointer & count to the start of the
+	buffer, and return TRUE indicating that the buffer has been reloaded.
+	It is not necessary to fill the buffer entirely, only to obtain at
+	least one more byte.  bytes_in_buffer MUST be set to a positive value
+	if TRUE is returned.  A FALSE return should only be used when I/O
+	suspension is desired (this mode is discussed in the next section).
+
+skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+	Skip num_bytes worth of data.  The buffer pointer and count should
+	be advanced over num_bytes input bytes, refilling the buffer as
+	needed.  This is used to skip over a potentially large amount of
+	uninteresting data (such as an APPn marker).  In some applications
+	it may be possible to optimize away the reading of the skipped data,
+	but it's not clear that being smart is worth much trouble; large
+	skips are uncommon.  bytes_in_buffer may be zero on return.
+	A zero or negative skip count should be treated as a no-op.
+
+resync_to_restart (j_decompress_ptr cinfo, int desired)
+	This routine is called only when the decompressor has failed to find
+	a restart (RSTn) marker where one is expected.  Its mission is to
+	find a suitable point for resuming decompression.  For most
+	applications, we recommend that you just use the default resync
+	procedure, jpeg_resync_to_restart().  However, if you are able to back
+	up in the input data stream, or if you have a-priori knowledge about
+	the likely location of restart markers, you may be able to do better.
+	Read the read_restart_marker() and jpeg_resync_to_restart() routines
+	in jdmarker.c if you think you'd like to implement your own resync
+	procedure.
+
+term_source (j_decompress_ptr cinfo)
+	Terminate source --- called by jpeg_finish_decompress() after all
+	data has been read.  Often a no-op.
+
+For both fill_input_buffer() and skip_input_data(), there is no such thing
+as an EOF return.  If the end of the file has been reached, the routine has
+a choice of exiting via ERREXIT() or inserting fake data into the buffer.
+In most cases, generating a warning message and inserting a fake EOI marker
+is the best course of action --- this will allow the decompressor to output
+however much of the image is there.  In pathological cases, the decompressor
+may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
+jdatasrc.c illustrates the recommended error recovery behavior.
+
+term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
+the source manager to be cleaned up during an abort, you must do it yourself.
+
+You will also need code to create a jpeg_source_mgr struct, fill in its method
+pointers, and insert a pointer to the struct into the "src" field of the JPEG
+decompression object.  This can be done in-line in your setup code if you
+like, but it's probably cleaner to provide a separate routine similar to the
+jpeg_stdio_src() or jpeg_mem_src() routines of the supplied source managers.
+
+For more information, consult the memory and stdio source and destination
+managers in jdatasrc.c and jdatadst.c.
+
+
+I/O suspension
+--------------
+
+Some applications need to use the JPEG library as an incremental memory-to-
+memory filter: when the compressed data buffer is filled or emptied, they want
+control to return to the outer loop, rather than expecting that the buffer can
+be emptied or reloaded within the data source/destination manager subroutine.
+The library supports this need by providing an "I/O suspension" mode, which we
+describe in this section.
+
+The I/O suspension mode is not a panacea: nothing is guaranteed about the
+maximum amount of time spent in any one call to the library, so it will not
+eliminate response-time problems in single-threaded applications.  If you
+need guaranteed response time, we suggest you "bite the bullet" and implement
+a real multi-tasking capability.
+
+To use I/O suspension, cooperation is needed between the calling application
+and the data source or destination manager; you will always need a custom
+source/destination manager.  (Please read the previous section if you haven't
+already.)  The basic idea is that the empty_output_buffer() or
+fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
+that it has done nothing.  Upon seeing this, the JPEG library suspends
+operation and returns to its caller.  The surrounding application is
+responsible for emptying or refilling the work buffer before calling the
+JPEG library again.
+
+Compression suspension:
+
+For compression suspension, use an empty_output_buffer() routine that returns
+FALSE; typically it will not do anything else.  This will cause the
+compressor to return to the caller of jpeg_write_scanlines(), with the return
+value indicating that not all the supplied scanlines have been accepted.
+The application must make more room in the output buffer, adjust the output
+buffer pointer/count appropriately, and then call jpeg_write_scanlines()
+again, pointing to the first unconsumed scanline.
+
+When forced to suspend, the compressor will backtrack to a convenient stopping
+point (usually the start of the current MCU); it will regenerate some output
+data when restarted.  Therefore, although empty_output_buffer() is only
+called when the buffer is filled, you should NOT write out the entire buffer
+after a suspension.  Write only the data up to the current position of
+next_output_byte/free_in_buffer.  The data beyond that point will be
+regenerated after resumption.
+
+Because of the backtracking behavior, a good-size output buffer is essential
+for efficiency; you don't want the compressor to suspend often.  (In fact, an
+overly small buffer could lead to infinite looping, if a single MCU required
+more data than would fit in the buffer.)  We recommend a buffer of at least
+several Kbytes.  You may want to insert explicit code to ensure that you don't
+call jpeg_write_scanlines() unless there is a reasonable amount of space in
+the output buffer; in other words, flush the buffer before trying to compress
+more data.
+
+The compressor does not allow suspension while it is trying to write JPEG
+markers at the beginning and end of the file.  This means that:
+  * At the beginning of a compression operation, there must be enough free
+    space in the output buffer to hold the header markers (typically 600 or
+    so bytes).  The recommended buffer size is bigger than this anyway, so
+    this is not a problem as long as you start with an empty buffer.  However,
+    this restriction might catch you if you insert large special markers, such
+    as a JFIF thumbnail image, without flushing the buffer afterwards.
+  * When you call jpeg_finish_compress(), there must be enough space in the
+    output buffer to emit any buffered data and the final EOI marker.  In the
+    current implementation, half a dozen bytes should suffice for this, but
+    for safety's sake we recommend ensuring that at least 100 bytes are free
+    before calling jpeg_finish_compress().
+
+A more significant restriction is that jpeg_finish_compress() cannot suspend.
+This means you cannot use suspension with multi-pass operating modes, namely
+Huffman code optimization and multiple-scan output.  Those modes write the
+whole file during jpeg_finish_compress(), which will certainly result in
+buffer overrun.  (Note that this restriction applies only to compression,
+not decompression.  The decompressor supports input suspension in all of its
+operating modes.)
+
+Decompression suspension:
+
+For decompression suspension, use a fill_input_buffer() routine that simply
+returns FALSE (except perhaps during error recovery, as discussed below).
+This will cause the decompressor to return to its caller with an indication
+that suspension has occurred.  This can happen at four places:
+  * jpeg_read_header(): will return JPEG_SUSPENDED.
+  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
+  * jpeg_read_scanlines(): will return the number of scanlines already
+    completed (possibly 0).
+  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
+The surrounding application must recognize these cases, load more data into
+the input buffer, and repeat the call.  In the case of jpeg_read_scanlines(),
+increment the passed pointers past any scanlines successfully read.
+
+Just as with compression, the decompressor will typically backtrack to a
+convenient restart point before suspending.  When fill_input_buffer() is
+called, next_input_byte/bytes_in_buffer point to the current restart point,
+which is where the decompressor will backtrack to if FALSE is returned.
+The data beyond that position must NOT be discarded if you suspend; it needs
+to be re-read upon resumption.  In most implementations, you'll need to shift
+this data down to the start of your work buffer and then load more data after
+it.  Again, this behavior means that a several-Kbyte work buffer is essential
+for decent performance; furthermore, you should load a reasonable amount of
+new data before resuming decompression.  (If you loaded, say, only one new
+byte each time around, you could waste a LOT of cycles.)
+
+The skip_input_data() source manager routine requires special care in a
+suspension scenario.  This routine is NOT granted the ability to suspend the
+decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
+requested skip distance exceeds the amount of data currently in the input
+buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
+additional skip distance somewhere else.  The decompressor will immediately
+call fill_input_buffer(), which should return FALSE, which will cause a
+suspension return.  The surrounding application must then arrange to discard
+the recorded number of bytes before it resumes loading the input buffer.
+(Yes, this design is rather baroque, but it avoids complexity in the far more
+common case where a non-suspending source manager is used.)
+
+If the input data has been exhausted, we recommend that you emit a warning
+and insert dummy EOI markers just as a non-suspending data source manager
+would do.  This can be handled either in the surrounding application logic or
+within fill_input_buffer(); the latter is probably more efficient.  If
+fill_input_buffer() knows that no more data is available, it can set the
+pointer/count to point to a dummy EOI marker and then return TRUE just as
+though it had read more data in a non-suspending situation.
+
+The decompressor does not attempt to suspend within standard JPEG markers;
+instead it will backtrack to the start of the marker and reprocess the whole
+marker next time.  Hence the input buffer must be large enough to hold the
+longest standard marker in the file.  Standard JPEG markers should normally
+not exceed a few hundred bytes each (DHT tables are typically the longest).
+We recommend at least a 2K buffer for performance reasons, which is much
+larger than any correct marker is likely to be.  For robustness against
+damaged marker length counts, you may wish to insert a test in your
+application for the case that the input buffer is completely full and yet
+the decoder has suspended without consuming any data --- otherwise, if this
+situation did occur, it would lead to an endless loop.  (The library can't
+provide this test since it has no idea whether "the buffer is full", or
+even whether there is a fixed-size input buffer.)
+
+The input buffer would need to be 64K to allow for arbitrary COM or APPn
+markers, but these are handled specially: they are either saved into allocated
+memory, or skipped over by calling skip_input_data().  In the former case,
+suspension is handled correctly, and in the latter case, the problem of
+buffer overrun is placed on skip_input_data's shoulders, as explained above.
+Note that if you provide your own marker handling routine for large markers,
+you should consider how to deal with buffer overflow.
+
+Multiple-buffer management:
+
+In some applications it is desirable to store the compressed data in a linked
+list of buffer areas, so as to avoid data copying.  This can be handled by
+having empty_output_buffer() or fill_input_buffer() set the pointer and count
+to reference the next available buffer; FALSE is returned only if no more
+buffers are available.  Although seemingly straightforward, there is a
+pitfall in this approach: the backtrack that occurs when FALSE is returned
+could back up into an earlier buffer.  For example, when fill_input_buffer()
+is called, the current pointer & count indicate the backtrack restart point.
+Since fill_input_buffer() will set the pointer and count to refer to a new
+buffer, the restart position must be saved somewhere else.  Suppose a second
+call to fill_input_buffer() occurs in the same library call, and no
+additional input data is available, so fill_input_buffer() must return FALSE.
+If the JPEG library has not moved the pointer/count forward in the current
+buffer, then *the correct restart point is the saved position in the prior
+buffer*.  Prior buffers may be discarded only after the library establishes
+a restart point within a later buffer.  Similar remarks apply for output into
+a chain of buffers.
+
+The library will never attempt to backtrack over a skip_input_data() call,
+so any skipped data can be permanently discarded.  You still have to deal
+with the case of skipping not-yet-received data, however.
+
+It's much simpler to use only a single buffer; when fill_input_buffer() is
+called, move any unconsumed data (beyond the current pointer/count) down to
+the beginning of this buffer and then load new data into the remaining buffer
+space.  This approach requires a little more data copying but is far easier
+to get right.
+
+
+Progressive JPEG support
+------------------------
+
+Progressive JPEG rearranges the stored data into a series of scans of
+increasing quality.  In situations where a JPEG file is transmitted across a
+slow communications link, a decoder can generate a low-quality image very
+quickly from the first scan, then gradually improve the displayed quality as
+more scans are received.  The final image after all scans are complete is
+identical to that of a regular (sequential) JPEG file of the same quality
+setting.  Progressive JPEG files are often slightly smaller than equivalent
+sequential JPEG files, but the possibility of incremental display is the main
+reason for using progressive JPEG.
+
+The IJG encoder library generates progressive JPEG files when given a
+suitable "scan script" defining how to divide the data into scans.
+Creation of progressive JPEG files is otherwise transparent to the encoder.
+Progressive JPEG files can also be read transparently by the decoder library.
+If the decoding application simply uses the library as defined above, it
+will receive a final decoded image without any indication that the file was
+progressive.  Of course, this approach does not allow incremental display.
+To perform incremental display, an application needs to use the decoder
+library's "buffered-image" mode, in which it receives a decoded image
+multiple times.
+
+Each displayed scan requires about as much work to decode as a full JPEG
+image of the same size, so the decoder must be fairly fast in relation to the
+data transmission rate in order to make incremental display useful.  However,
+it is possible to skip displaying the image and simply add the incoming bits
+to the decoder's coefficient buffer.  This is fast because only Huffman
+decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
+The IJG decoder library allows the application to switch dynamically between
+displaying the image and simply absorbing the incoming bits.  A properly
+coded application can automatically adapt the number of display passes to
+suit the time available as the image is received.  Also, a final
+higher-quality display cycle can be performed from the buffered data after
+the end of the file is reached.
+
+Progressive compression:
+
+To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
+set the scan_info cinfo field to point to an array of scan descriptors, and
+perform compression as usual.  Instead of constructing your own scan list,
+you can call the jpeg_simple_progression() helper routine to create a
+recommended progression sequence; this method should be used by all
+applications that don't want to get involved in the nitty-gritty of
+progressive scan sequence design.  (If you want to provide user control of
+scan sequences, you may wish to borrow the scan script reading code found
+in rdswitch.c, so that you can read scan script files just like cjpeg's.)
+When scan_info is not NULL, the compression library will store DCT'd data
+into a buffer array as jpeg_write_scanlines() is called, and will emit all
+the requested scans during jpeg_finish_compress().  This implies that
+multiple-scan output cannot be created with a suspending data destination
+manager, since jpeg_finish_compress() does not support suspension.  We
+should also note that the compressor currently forces Huffman optimization
+mode when creating a progressive JPEG file, because the default Huffman
+tables are unsuitable for progressive files.
+
+Progressive decompression:
+
+When buffered-image mode is not used, the decoder library will read all of
+a multi-scan file during jpeg_start_decompress(), so that it can provide a
+final decoded image.  (Here "multi-scan" means either progressive or
+multi-scan sequential.)  This makes multi-scan files transparent to the
+decoding application.  However, existing applications that used suspending
+input with version 5 of the IJG library will need to be modified to check
+for a suspension return from jpeg_start_decompress().
+
+To perform incremental display, an application must use the library's
+buffered-image mode.  This is described in the next section.
+
+
+Buffered-image mode
+-------------------
+
+In buffered-image mode, the library stores the partially decoded image in a
+coefficient buffer, from which it can be read out as many times as desired.
+This mode is typically used for incremental display of progressive JPEG files,
+but it can be used with any JPEG file.  Each scan of a progressive JPEG file
+adds more data (more detail) to the buffered image.  The application can
+display in lockstep with the source file (one display pass per input scan),
+or it can allow input processing to outrun display processing.  By making
+input and display processing run independently, it is possible for the
+application to adapt progressive display to a wide range of data transmission
+rates.
+
+The basic control flow for buffered-image decoding is
+
+	jpeg_create_decompress()
+	set data source
+	jpeg_read_header()
+	set overall decompression parameters
+	cinfo.buffered_image = TRUE;	/* select buffered-image mode */
+	jpeg_start_decompress()
+	for (each output pass) {
+	    adjust output decompression parameters if required
+	    jpeg_start_output()		/* start a new output pass */
+	    for (all scanlines in image) {
+	        jpeg_read_scanlines()
+	        display scanlines
+	    }
+	    jpeg_finish_output()	/* terminate output pass */
+	}
+	jpeg_finish_decompress()
+	jpeg_destroy_decompress()
+
+This differs from ordinary unbuffered decoding in that there is an additional
+level of looping.  The application can choose how many output passes to make
+and how to display each pass.
+
+The simplest approach to displaying progressive images is to do one display
+pass for each scan appearing in the input file.  In this case the outer loop
+condition is typically
+	while (! jpeg_input_complete(&cinfo))
+and the start-output call should read
+	jpeg_start_output(&cinfo, cinfo.input_scan_number);
+The second parameter to jpeg_start_output() indicates which scan of the input
+file is to be displayed; the scans are numbered starting at 1 for this
+purpose.  (You can use a loop counter starting at 1 if you like, but using
+the library's input scan counter is easier.)  The library automatically reads
+data as necessary to complete each requested scan, and jpeg_finish_output()
+advances to the next scan or end-of-image marker (hence input_scan_number
+will be incremented by the time control arrives back at jpeg_start_output()).
+With this technique, data is read from the input file only as needed, and
+input and output processing run in lockstep.
+
+After reading the final scan and reaching the end of the input file, the
+buffered image remains available; it can be read additional times by
+repeating the jpeg_start_output()/jpeg_read_scanlines()/jpeg_finish_output()
+sequence.  For example, a useful technique is to use fast one-pass color
+quantization for display passes made while the image is arriving, followed by
+a final display pass using two-pass quantization for highest quality.  This
+is done by changing the library parameters before the final output pass.
+Changing parameters between passes is discussed in detail below.
+
+In general the last scan of a progressive file cannot be recognized as such
+until after it is read, so a post-input display pass is the best approach if
+you want special processing in the final pass.
+
+When done with the image, be sure to call jpeg_finish_decompress() to release
+the buffered image (or just use jpeg_destroy_decompress()).
+
+If input data arrives faster than it can be displayed, the application can
+cause the library to decode input data in advance of what's needed to produce
+output.  This is done by calling the routine jpeg_consume_input().
+The return value is one of the following:
+	JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
+	JPEG_REACHED_EOI:    reached the EOI marker (end of image)
+	JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
+	JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
+	JPEG_SUSPENDED:      suspended before completing any of the above
+(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
+routine can be called at any time after initializing the JPEG object.  It
+reads some additional data and returns when one of the indicated significant
+events occurs.  (If called after the EOI marker is reached, it will
+immediately return JPEG_REACHED_EOI without attempting to read more data.)
+
+The library's output processing will automatically call jpeg_consume_input()
+whenever the output processing overtakes the input; thus, simple lockstep
+display requires no direct calls to jpeg_consume_input().  But by adding
+calls to jpeg_consume_input(), you can absorb data in advance of what is
+being displayed.  This has two benefits:
+  * You can limit buildup of unprocessed data in your input buffer.
+  * You can eliminate extra display passes by paying attention to the
+    state of the library's input processing.
+
+The first of these benefits only requires interspersing calls to
+jpeg_consume_input() with your display operations and any other processing
+you may be doing.  To avoid wasting cycles due to backtracking, it's best to
+call jpeg_consume_input() only after a hundred or so new bytes have arrived.
+This is discussed further under "I/O suspension", above.  (Note: the JPEG
+library currently is not thread-safe.  You must not call jpeg_consume_input()
+from one thread of control if a different library routine is working on the
+same JPEG object in another thread.)
+
+When input arrives fast enough that more than one new scan is available
+before you start a new output pass, you may as well skip the output pass
+corresponding to the completed scan.  This occurs for free if you pass
+cinfo.input_scan_number as the target scan number to jpeg_start_output().
+The input_scan_number field is simply the index of the scan currently being
+consumed by the input processor.  You can ensure that this is up-to-date by
+emptying the input buffer just before calling jpeg_start_output(): call
+jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
+JPEG_REACHED_EOI.
+
+The target scan number passed to jpeg_start_output() is saved in the
+cinfo.output_scan_number field.  The library's output processing calls
+jpeg_consume_input() whenever the current input scan number and row within
+that scan are less than or equal to the current output scan number and row.
+Thus, input processing can "get ahead" of the output processing but is not
+allowed to "fall behind".  You can achieve several different effects by
+manipulating this interlock rule.  For example, if you pass a target scan
+number greater than the current input scan number, the output processor will
+wait until that scan starts to arrive before producing any output.  (To avoid
+an infinite loop, the target scan number is automatically reset to the last
+scan number when the end of image is reached.  Thus, if you specify a large
+target scan number, the library will just absorb the entire input file and
+then perform an output pass.  This is effectively the same as what
+jpeg_start_decompress() does when you don't select buffered-image mode.)
+When you pass a target scan number equal to the current input scan number,
+the image is displayed no faster than the current input scan arrives.  The
+final possibility is to pass a target scan number less than the current input
+scan number; this disables the input/output interlock and causes the output
+processor to simply display whatever it finds in the image buffer, without
+waiting for input.  (However, the library will not accept a target scan
+number less than one, so you can't avoid waiting for the first scan.)
+
+When data is arriving faster than the output display processing can advance
+through the image, jpeg_consume_input() will store data into the buffered
+image beyond the point at which the output processing is reading data out
+again.  If the input arrives fast enough, it may "wrap around" the buffer to
+the point where the input is more than one whole scan ahead of the output.
+If the output processing simply proceeds through its display pass without
+paying attention to the input, the effect seen on-screen is that the lower
+part of the image is one or more scans better in quality than the upper part.
+Then, when the next output scan is started, you have a choice of what target
+scan number to use.  The recommended choice is to use the current input scan
+number at that time, which implies that you've skipped the output scans
+corresponding to the input scans that were completed while you processed the
+previous output scan.  In this way, the decoder automatically adapts its
+speed to the arriving data, by skipping output scans as necessary to keep up
+with the arriving data.
+
+When using this strategy, you'll want to be sure that you perform a final
+output pass after receiving all the data; otherwise your last display may not
+be full quality across the whole screen.  So the right outer loop logic is
+something like this:
+	do {
+	    absorb any waiting input by calling jpeg_consume_input()
+	    final_pass = jpeg_input_complete(&cinfo);
+	    adjust output decompression parameters if required
+	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
+	    ...
+	    jpeg_finish_output()
+	} while (! final_pass);
+rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
+arrangement makes it simple to use higher-quality decoding parameters
+for the final pass.  But if you don't want to use special parameters for
+the final pass, the right loop logic is like this:
+	for (;;) {
+	    absorb any waiting input by calling jpeg_consume_input()
+	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
+	    ...
+	    jpeg_finish_output()
+	    if (jpeg_input_complete(&cinfo) &&
+	        cinfo.input_scan_number == cinfo.output_scan_number)
+	      break;
+	}
+In this case you don't need to know in advance whether an output pass is to
+be the last one, so it's not necessary to have reached EOF before starting
+the final output pass; rather, what you want to test is whether the output
+pass was performed in sync with the final input scan.  This form of the loop
+will avoid an extra output pass whenever the decoder is able (or nearly able)
+to keep up with the incoming data.
+
+When the data transmission speed is high, you might begin a display pass,
+then find that much or all of the file has arrived before you can complete
+the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
+from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
+In this situation you may wish to abort the current display pass and start a
+new one using the newly arrived information.  To do so, just call
+jpeg_finish_output() and then start a new pass with jpeg_start_output().
+
+A variant strategy is to abort and restart display if more than one complete
+scan arrives during an output pass; this can be detected by noting
+JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
+idea should be employed with caution, however, since the display process
+might never get to the bottom of the image before being aborted, resulting
+in the lower part of the screen being several passes worse than the upper.
+In most cases it's probably best to abort an output pass only if the whole
+file has arrived and you want to begin the final output pass immediately.
+
+When receiving data across a communication link, we recommend always using
+the current input scan number for the output target scan number; if a
+higher-quality final pass is to be done, it should be started (aborting any
+incomplete output pass) as soon as the end of file is received.  However,
+many other strategies are possible.  For example, the application can examine
+the parameters of the current input scan and decide whether to display it or
+not.  If the scan contains only chroma data, one might choose not to use it
+as the target scan, expecting that the scan will be small and will arrive
+quickly.  To skip to the next scan, call jpeg_consume_input() until it
+returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
+number as the target scan for jpeg_start_output(); but that method doesn't
+let you inspect the next scan's parameters before deciding to display it.
+
+
+In buffered-image mode, jpeg_start_decompress() never performs input and
+thus never suspends.  An application that uses input suspension with
+buffered-image mode must be prepared for suspension returns from these
+routines:
+* jpeg_start_output() performs input only if you request 2-pass quantization
+  and the target scan isn't fully read yet.  (This is discussed below.)
+* jpeg_read_scanlines(), as always, returns the number of scanlines that it
+  was able to produce before suspending.
+* jpeg_finish_output() will read any markers following the target scan,
+  up to the end of the file or the SOS marker that begins another scan.
+  (But it reads no input if jpeg_consume_input() has already reached the
+  end of the file or a SOS marker beyond the target output scan.)
+* jpeg_finish_decompress() will read until the end of file, and thus can
+  suspend if the end hasn't already been reached (as can be tested by
+  calling jpeg_input_complete()).
+jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
+all return TRUE if they completed their tasks, FALSE if they had to suspend.
+In the event of a FALSE return, the application must load more input data
+and repeat the call.  Applications that use non-suspending data sources need
+not check the return values of these three routines.
+
+
+It is possible to change decoding parameters between output passes in the
+buffered-image mode.  The decoder library currently supports only very
+limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
+allowed after jpeg_start_decompress() is called:
+* dct_method can be changed before each call to jpeg_start_output().
+  For example, one could use a fast DCT method for early scans, changing
+  to a higher quality method for the final scan.
+* dither_mode can be changed before each call to jpeg_start_output();
+  of course this has no impact if not using color quantization.  Typically
+  one would use ordered dither for initial passes, then switch to
+  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
+  can cause more memory to be allocated by the library.  Although the amount
+  of memory involved is not large (a scanline or so), it may cause the
+  initial max_memory_to_use specification to be exceeded, which in the worst
+  case would result in an out-of-memory failure.
+* do_block_smoothing can be changed before each call to jpeg_start_output().
+  This setting is relevant only when decoding a progressive JPEG image.
+  During the first DC-only scan, block smoothing provides a very "fuzzy" look
+  instead of the very "blocky" look seen without it; which is better seems a
+  matter of personal taste.  But block smoothing is nearly always a win
+  during later stages, especially when decoding a successive-approximation
+  image: smoothing helps to hide the slight blockiness that otherwise shows
+  up on smooth gradients until the lowest coefficient bits are sent.
+* Color quantization mode can be changed under the rules described below.
+  You *cannot* change between full-color and quantized output (because that
+  would alter the required I/O buffer sizes), but you can change which
+  quantization method is used.
+
+When generating color-quantized output, changing quantization method is a
+very useful way of switching between high-speed and high-quality display.
+The library allows you to change among its three quantization methods:
+1. Single-pass quantization to a fixed color cube.
+   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
+2. Single-pass quantization to an application-supplied colormap.
+   Selected by setting cinfo.colormap to point to the colormap (the value of
+   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
+3. Two-pass quantization to a colormap chosen specifically for the image.
+   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
+   (This is the default setting selected by jpeg_read_header, but it is
+   probably NOT what you want for the first pass of progressive display!)
+These methods offer successively better quality and lesser speed.  However,
+only the first method is available for quantizing in non-RGB color spaces.
+
+IMPORTANT: because the different quantizer methods have very different
+working-storage requirements, the library requires you to indicate which
+one(s) you intend to use before you call jpeg_start_decompress().  (If we did
+not require this, the max_memory_to_use setting would be a complete fiction.)
+You do this by setting one or more of these three cinfo fields to TRUE:
+	enable_1pass_quant		Fixed color cube colormap
+	enable_external_quant		Externally-supplied colormap
+	enable_2pass_quant		Two-pass custom colormap
+All three are initialized FALSE by jpeg_read_header().  But
+jpeg_start_decompress() automatically sets TRUE the one selected by the
+current two_pass_quantize and colormap settings, so you only need to set the
+enable flags for any other quantization methods you plan to change to later.
+
+After setting the enable flags correctly at jpeg_start_decompress() time, you
+can change to any enabled quantization method by setting two_pass_quantize
+and colormap properly just before calling jpeg_start_output().  The following
+special rules apply:
+1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
+   or 2-pass mode from a different mode, or when you want the 2-pass
+   quantizer to be re-run to generate a new colormap.
+2. To switch to an external colormap, or to change to a different external
+   colormap than was used on the prior pass, you must call
+   jpeg_new_colormap() after setting cinfo.colormap.
+NOTE: if you want to use the same colormap as was used in the prior pass,
+you should not do either of these things.  This will save some nontrivial
+switchover costs.
+(These requirements exist because cinfo.colormap will always be non-NULL
+after completing a prior output pass, since both the 1-pass and 2-pass
+quantizers set it to point to their output colormaps.  Thus you have to
+do one of these two things to notify the library that something has changed.
+Yup, it's a bit klugy, but it's necessary to do it this way for backwards
+compatibility.)
+
+Note that in buffered-image mode, the library generates any requested colormap
+during jpeg_start_output(), not during jpeg_start_decompress().
+
+When using two-pass quantization, jpeg_start_output() makes a pass over the
+buffered image to determine the optimum color map; it therefore may take a
+significant amount of time, whereas ordinarily it does little work.  The
+progress monitor hook is called during this pass, if defined.  It is also
+important to realize that if the specified target scan number is greater than
+or equal to the current input scan number, jpeg_start_output() will attempt
+to consume input as it makes this pass.  If you use a suspending data source,
+you need to check for a FALSE return from jpeg_start_output() under these
+conditions.  The combination of 2-pass quantization and a not-yet-fully-read
+target scan is the only case in which jpeg_start_output() will consume input.
+
+
+Application authors who support buffered-image mode may be tempted to use it
+for all JPEG images, even single-scan ones.  This will work, but it is
+inefficient: there is no need to create an image-sized coefficient buffer for
+single-scan images.  Requesting buffered-image mode for such an image wastes
+memory.  Worse, it can cost time on large images, since the buffered data has
+to be swapped out or written to a temporary file.  If you are concerned about
+maximum performance on baseline JPEG files, you should use buffered-image
+mode only when the incoming file actually has multiple scans.  This can be
+tested by calling jpeg_has_multiple_scans(), which will return a correct
+result at any time after jpeg_read_header() completes.
+
+It is also worth noting that when you use jpeg_consume_input() to let input
+processing get ahead of output processing, the resulting pattern of access to
+the coefficient buffer is quite nonsequential.  It's best to use the memory
+manager jmemnobs.c if you can (i.e., if you have enough real or virtual main
+memory).  If not, at least make sure that max_memory_to_use is set as high as
+possible.  If the JPEG memory manager has to use a temporary file, you will
+probably see a lot of disk traffic and poor performance.  (This could be
+improved with additional work on the memory manager, but we haven't gotten
+around to it yet.)
+
+In some applications it may be convenient to use jpeg_consume_input() for all
+input processing, including reading the initial markers; that is, you may
+wish to call jpeg_consume_input() instead of jpeg_read_header() during
+startup.  This works, but note that you must check for JPEG_REACHED_SOS and
+JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
+Once the first SOS marker has been reached, you must call
+jpeg_start_decompress() before jpeg_consume_input() will consume more input;
+it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
+tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
+without ever returning JPEG_REACHED_SOS; be sure to check for this case.
+If this happens, the decompressor will not read any more input until you call
+jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
+using buffered-image mode, but in that case it's basically a no-op after the
+initial markers have been read: it will just return JPEG_SUSPENDED.
+
+
+Abbreviated datastreams and multiple images
+-------------------------------------------
+
+A JPEG compression or decompression object can be reused to process multiple
+images.  This saves a small amount of time per image by eliminating the
+"create" and "destroy" operations, but that isn't the real purpose of the
+feature.  Rather, reuse of an object provides support for abbreviated JPEG
+datastreams.  Object reuse can also simplify processing a series of images in
+a single input or output file.  This section explains these features.
+
+A JPEG file normally contains several hundred bytes worth of quantization
+and Huffman tables.  In a situation where many images will be stored or
+transmitted with identical tables, this may represent an annoying overhead.
+The JPEG standard therefore permits tables to be omitted.  The standard
+defines three classes of JPEG datastreams:
+  * "Interchange" datastreams contain an image and all tables needed to decode
+    the image.  These are the usual kind of JPEG file.
+  * "Abbreviated image" datastreams contain an image, but are missing some or
+    all of the tables needed to decode that image.
+  * "Abbreviated table specification" (henceforth "tables-only") datastreams
+    contain only table specifications.
+To decode an abbreviated image, it is necessary to load the missing table(s)
+into the decoder beforehand.  This can be accomplished by reading a separate
+tables-only file.  A variant scheme uses a series of images in which the first
+image is an interchange (complete) datastream, while subsequent ones are
+abbreviated and rely on the tables loaded by the first image.  It is assumed
+that once the decoder has read a table, it will remember that table until a
+new definition for the same table number is encountered.
+
+It is the application designer's responsibility to figure out how to associate
+the correct tables with an abbreviated image.  While abbreviated datastreams
+can be useful in a closed environment, their use is strongly discouraged in
+any situation where data exchange with other applications might be needed.
+Caveat designer.
+
+The JPEG library provides support for reading and writing any combination of
+tables-only datastreams and abbreviated images.  In both compression and
+decompression objects, a quantization or Huffman table will be retained for
+the lifetime of the object, unless it is overwritten by a new table definition.
+
+
+To create abbreviated image datastreams, it is only necessary to tell the
+compressor not to emit some or all of the tables it is using.  Each
+quantization and Huffman table struct contains a boolean field "sent_table",
+which normally is initialized to FALSE.  For each table used by the image, the
+header-writing process emits the table and sets sent_table = TRUE unless it is
+already TRUE.  (In normal usage, this prevents outputting the same table
+definition multiple times, as would otherwise occur because the chroma
+components typically share tables.)  Thus, setting this field to TRUE before
+calling jpeg_start_compress() will prevent the table from being written at
+all.
+
+If you want to create a "pure" abbreviated image file containing no tables,
+just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
+tables.  If you want to emit some but not all tables, you'll need to set the
+individual sent_table fields directly.
+
+To create an abbreviated image, you must also call jpeg_start_compress()
+with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
+will force all the sent_table fields to FALSE.  (This is a safety feature to
+prevent abbreviated images from being created accidentally.)
+
+To create a tables-only file, perform the same parameter setup that you
+normally would, but instead of calling jpeg_start_compress() and so on, call
+jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
+containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
+and Huffman tables that are currently defined in the compression object will
+be emitted unless their sent_table flag is already TRUE, and then all the
+sent_table flags will be set TRUE.
+
+A sure-fire way to create matching tables-only and abbreviated image files
+is to proceed as follows:
+
+	create JPEG compression object
+	set JPEG parameters
+	set destination to tables-only file
+	jpeg_write_tables(&cinfo);
+	set destination to image file
+	jpeg_start_compress(&cinfo, FALSE);
+	write data...
+	jpeg_finish_compress(&cinfo);
+
+Since the JPEG parameters are not altered between writing the table file and
+the abbreviated image file, the same tables are sure to be used.  Of course,
+you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
+many times to produce many abbreviated image files matching the table file.
+
+You cannot suppress output of the computed Huffman tables when Huffman
+optimization is selected.  (If you could, there'd be no way to decode the
+image...)  Generally, you don't want to set optimize_coding = TRUE when
+you are trying to produce abbreviated files.
+
+In some cases you might want to compress an image using tables which are
+not stored in the application, but are defined in an interchange or
+tables-only file readable by the application.  This can be done by setting up
+a JPEG decompression object to read the specification file, then copying the
+tables into your compression object.  See jpeg_copy_critical_parameters()
+for an example of copying quantization tables.
+
+
+To read abbreviated image files, you simply need to load the proper tables
+into the decompression object before trying to read the abbreviated image.
+If the proper tables are stored in the application program, you can just
+allocate the table structs and fill in their contents directly.  For example,
+to load a fixed quantization table into table slot "n":
+
+    if (cinfo.quant_tbl_ptrs[n] == NULL)
+      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
+    quant_ptr = cinfo.quant_tbl_ptrs[n];	/* quant_ptr is JQUANT_TBL* */
+    for (i = 0; i < 64; i++) {
+      /* Qtable[] is desired quantization table, in natural array order */
+      quant_ptr->quantval[i] = Qtable[i];
+    }
+
+Code to load a fixed Huffman table is typically (for AC table "n"):
+
+    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
+      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
+    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];	/* huff_ptr is JHUFF_TBL* */
+    for (i = 1; i <= 16; i++) {
+      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
+      huff_ptr->bits[i] = counts[i];
+    }
+    for (i = 0; i < 256; i++) {
+      /* symbols[] is the list of Huffman symbols, in code-length order */
+      huff_ptr->huffval[i] = symbols[i];
+    }
+
+(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
+constant JQUANT_TBL object is not safe.  If the incoming file happened to
+contain a quantization table definition, your master table would get
+overwritten!  Instead allocate a working table copy and copy the master table
+into it, as illustrated above.  Ditto for Huffman tables, of course.)
+
+You might want to read the tables from a tables-only file, rather than
+hard-wiring them into your application.  The jpeg_read_header() call is
+sufficient to read a tables-only file.  You must pass a second parameter of
+FALSE to indicate that you do not require an image to be present.  Thus, the
+typical scenario is
+
+	create JPEG decompression object
+	set source to tables-only file
+	jpeg_read_header(&cinfo, FALSE);
+	set source to abbreviated image file
+	jpeg_read_header(&cinfo, TRUE);
+	set decompression parameters
+	jpeg_start_decompress(&cinfo);
+	read data...
+	jpeg_finish_decompress(&cinfo);
+
+In some cases, you may want to read a file without knowing whether it contains
+an image or just tables.  In that case, pass FALSE and check the return value
+from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
+JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
+JPEG_SUSPENDED, is possible when using a suspending data source manager.)
+Note that jpeg_read_header() will not complain if you read an abbreviated
+image for which you haven't loaded the missing tables; the missing-table check
+occurs later, in jpeg_start_decompress().
+
+
+It is possible to read a series of images from a single source file by
+repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
+without releasing/recreating the JPEG object or the data source module.
+(If you did reinitialize, any partial bufferload left in the data source
+buffer at the end of one image would be discarded, causing you to lose the
+start of the next image.)  When you use this method, stored tables are
+automatically carried forward, so some of the images can be abbreviated images
+that depend on tables from earlier images.
+
+If you intend to write a series of images into a single destination file,
+you might want to make a specialized data destination module that doesn't
+flush the output buffer at term_destination() time.  This would speed things
+up by some trifling amount.  Of course, you'd need to remember to flush the
+buffer after the last image.  You can make the later images be abbreviated
+ones by passing FALSE to jpeg_start_compress().
+
+
+Special markers
+---------------
+
+Some applications may need to insert or extract special data in the JPEG
+datastream.  The JPEG standard provides marker types "COM" (comment) and
+"APP0" through "APP15" (application) to hold application-specific data.
+Unfortunately, the use of these markers is not specified by the standard.
+COM markers are fairly widely used to hold user-supplied text.  The JFIF file
+format spec uses APP0 markers with specified initial strings to hold certain
+data.  Adobe applications use APP14 markers beginning with the string "Adobe"
+for miscellaneous data.  Other APPn markers are rarely seen, but might
+contain almost anything.
+
+If you wish to store user-supplied text, we recommend you use COM markers
+and place readable 7-bit ASCII text in them.  Newline conventions are not
+standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
+(Mac style).  A robust COM reader should be able to cope with random binary
+garbage, including nulls, since some applications generate COM markers
+containing non-ASCII junk.  (But yours should not be one of them.)
+
+For program-supplied data, use an APPn marker, and be sure to begin it with an
+identifying string so that you can tell whether the marker is actually yours.
+It's probably best to avoid using APP0 or APP14 for any private markers.
+(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
+not use APP8 markers for any private purposes, either.)
+
+Keep in mind that at most 65533 bytes can be put into one marker, but you
+can have as many markers as you like.
+
+By default, the IJG compression library will write a JFIF APP0 marker if the
+selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
+the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
+we don't recommend it.  The decompression library will recognize JFIF and
+Adobe markers and will set the JPEG colorspace properly when one is found.
+
+
+You can write special markers immediately following the datastream header by
+calling jpeg_write_marker() after jpeg_start_compress() and before the first
+call to jpeg_write_scanlines().  When you do this, the markers appear after
+the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
+all else.  Specify the marker type parameter as "JPEG_COM" for COM or
+"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
+any marker type, but we don't recommend writing any other kinds of marker.)
+For example, to write a user comment string pointed to by comment_text:
+	jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
+
+If it's not convenient to store all the marker data in memory at once,
+you can instead call jpeg_write_m_header() followed by multiple calls to
+jpeg_write_m_byte().  If you do it this way, it's your responsibility to
+call jpeg_write_m_byte() exactly the number of times given in the length
+parameter to jpeg_write_m_header().  (This method lets you empty the
+output buffer partway through a marker, which might be important when
+using a suspending data destination module.  In any case, if you are using
+a suspending destination, you should flush its buffer after inserting
+any special markers.  See "I/O suspension".)
+
+Or, if you prefer to synthesize the marker byte sequence yourself,
+you can just cram it straight into the data destination module.
+
+If you are writing JFIF 1.02 extension markers (thumbnail images), don't
+forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
+correct JFIF version number in the JFIF header marker.  The library's default
+is to write version 1.01, but that's wrong if you insert any 1.02 extension
+markers.  (We could probably get away with just defaulting to 1.02, but there
+used to be broken decoders that would complain about unknown minor version
+numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
+you are actually using 1.02 extensions.)
+
+
+When reading, two methods of handling special markers are available:
+1. You can ask the library to save the contents of COM and/or APPn markers
+into memory, and then examine them at your leisure afterwards.
+2. You can supply your own routine to process COM and/or APPn markers
+on-the-fly as they are read.
+The first method is simpler to use, especially if you are using a suspending
+data source; writing a marker processor that copes with input suspension is
+not easy (consider what happens if the marker is longer than your available
+input buffer).  However, the second method conserves memory since the marker
+data need not be kept around after it's been processed.
+
+For either method, you'd normally set up marker handling after creating a
+decompression object and before calling jpeg_read_header(), because the
+markers of interest will typically be near the head of the file and so will
+be scanned by jpeg_read_header.  Once you've established a marker handling
+method, it will be used for the life of that decompression object
+(potentially many datastreams), unless you change it.  Marker handling is
+determined separately for COM markers and for each APPn marker code.
+
+
+To save the contents of special markers in memory, call
+	jpeg_save_markers(cinfo, marker_code, length_limit)
+where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
+(To arrange to save all the special marker types, you need to call this
+routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
+than length_limit data bytes, only length_limit bytes will be saved; this
+parameter allows you to avoid chewing up memory when you only need to see the
+first few bytes of a potentially large marker.  If you want to save all the
+data, set length_limit to 0xFFFF; that is enough since marker lengths are only
+16 bits.  As a special case, setting length_limit to 0 prevents that marker
+type from being saved at all.  (That is the default behavior, in fact.)
+
+After jpeg_read_header() completes, you can examine the special markers by
+following the cinfo->marker_list pointer chain.  All the special markers in
+the file appear in this list, in order of their occurrence in the file (but
+omitting any markers of types you didn't ask for).  Both the original data
+length and the saved data length are recorded for each list entry; the latter
+will not exceed length_limit for the particular marker type.  Note that these
+lengths exclude the marker length word, whereas the stored representation
+within the JPEG file includes it.  (Hence the maximum data length is really
+only 65533.)
+
+It is possible that additional special markers appear in the file beyond the
+SOS marker at which jpeg_read_header stops; if so, the marker list will be
+extended during reading of the rest of the file.  This is not expected to be
+common, however.  If you are short on memory you may want to reset the length
+limit to zero for all marker types after finishing jpeg_read_header, to
+ensure that the max_memory_to_use setting cannot be exceeded due to addition
+of later markers.
+
+The marker list remains stored until you call jpeg_finish_decompress or
+jpeg_abort, at which point the memory is freed and the list is set to empty.
+(jpeg_destroy also releases the storage, of course.)
+
+Note that the library is internally interested in APP0 and APP14 markers;
+if you try to set a small nonzero length limit on these types, the library
+will silently force the length up to the minimum it wants.  (But you can set
+a zero length limit to prevent them from being saved at all.)  Also, in a
+16-bit environment, the maximum length limit may be constrained to less than
+65533 by malloc() limitations.  It is therefore best not to assume that the
+effective length limit is exactly what you set it to be.
+
+
+If you want to supply your own marker-reading routine, you do it by calling
+jpeg_set_marker_processor().  A marker processor routine must have the
+signature
+	boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
+Although the marker code is not explicitly passed, the routine can find it
+in cinfo->unread_marker.  At the time of call, the marker proper has been
+read from the data source module.  The processor routine is responsible for
+reading the marker length word and the remaining parameter bytes, if any.
+Return TRUE to indicate success.  (FALSE should be returned only if you are
+using a suspending data source and it tells you to suspend.  See the standard
+marker processors in jdmarker.c for appropriate coding methods if you need to
+use a suspending data source.)
+
+If you override the default APP0 or APP14 processors, it is up to you to
+recognize JFIF and Adobe markers if you want colorspace recognition to occur
+properly.  We recommend copying and extending the default processors if you
+want to do that.  (A better idea is to save these marker types for later
+examination by calling jpeg_save_markers(); that method doesn't interfere
+with the library's own processing of these markers.)
+
+jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
+--- if you call one it overrides any previous call to the other, for the
+particular marker type specified.
+
+A simple example of an external COM processor can be found in djpeg.c.
+Also, see jpegtran.c for an example of using jpeg_save_markers.
+
+
+Raw (downsampled) image data
+----------------------------
+
+Some applications need to supply already-downsampled image data to the JPEG
+compressor, or to receive raw downsampled data from the decompressor.  The
+library supports this requirement by allowing the application to write or
+read raw data, bypassing the normal preprocessing or postprocessing steps.
+The interface is different from the standard one and is somewhat harder to
+use.  If your interest is merely in bypassing color conversion, we recommend
+that you use the standard interface and simply set jpeg_color_space =
+in_color_space (or jpeg_color_space = out_color_space for decompression).
+The mechanism described in this section is necessary only to supply or
+receive downsampled image data, in which not all components have the same
+dimensions.
+
+
+To compress raw data, you must supply the data in the colorspace to be used
+in the JPEG file (please read the earlier section on Special color spaces)
+and downsampled to the sampling factors specified in the JPEG parameters.
+You must supply the data in the format used internally by the JPEG library,
+namely a JSAMPIMAGE array.  This is an array of pointers to two-dimensional
+arrays, each of type JSAMPARRAY.  Each 2-D array holds the values for one
+color component.  This structure is necessary since the components are of
+different sizes.  If the image dimensions are not a multiple of the MCU size,
+you must also pad the data correctly (usually, this is done by replicating
+the last column and/or row).  The data must be padded to a multiple of a DCT
+block in each component: that is, each downsampled row must contain a
+multiple of 8 valid samples, and there must be a multiple of 8 sample rows
+for each component.  (For applications such as conversion of digital TV
+images, the standard image size is usually a multiple of the DCT block size,
+so that no padding need actually be done.)
+
+The procedure for compression of raw data is basically the same as normal
+compression, except that you call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  Before calling jpeg_start_compress(), you must do
+the following:
+  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
+    This notifies the library that you will be supplying raw data.
+    Furthermore, set cinfo->do_fancy_downsampling to FALSE if you want to use
+    real downsampled data.  (It is set TRUE by jpeg_set_defaults().)
+  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
+    call is a good idea.  Note that since color conversion is bypassed,
+    in_color_space is ignored, except that jpeg_set_defaults() uses it to
+    choose the default jpeg_color_space setting.
+  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
+    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
+    dimensions of the data you are supplying, it's wise to set them
+    explicitly, rather than assuming the library's defaults are what you want.
+
+To pass raw data to the library, call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  The two routines work similarly except that
+jpeg_write_raw_data takes a JSAMPIMAGE data array rather than JSAMPARRAY.
+The scanlines count passed to and returned from jpeg_write_raw_data is
+measured in terms of the component with the largest v_samp_factor.
+
+jpeg_write_raw_data() processes one MCU row per call, which is to say
+v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
+value must be at least max_v_samp_factor*DCTSIZE, and the return value will
+be exactly that amount (or possibly some multiple of that amount, in future
+library versions).  This is true even on the last call at the bottom of the
+image; don't forget to pad your data as necessary.
+
+The required dimensions of the supplied data can be computed for each
+component as
+	cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
+	cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
+after jpeg_start_compress() has initialized those fields.  If the valid data
+is smaller than this, it must be padded appropriately.  For some sampling
+factors and image sizes, additional dummy DCT blocks are inserted to make
+the image a multiple of the MCU dimensions.  The library creates such dummy
+blocks itself; it does not read them from your supplied data.  Therefore you
+need never pad by more than DCTSIZE samples.  An example may help here.
+Assume 2h2v downsampling of YCbCr data, that is
+	cinfo->comp_info[0].h_samp_factor = 2		for Y
+	cinfo->comp_info[0].v_samp_factor = 2
+	cinfo->comp_info[1].h_samp_factor = 1		for Cb
+	cinfo->comp_info[1].v_samp_factor = 1
+	cinfo->comp_info[2].h_samp_factor = 1		for Cr
+	cinfo->comp_info[2].v_samp_factor = 1
+and suppose that the nominal image dimensions (cinfo->image_width and
+cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
+compute downsampled_width = 101 and width_in_blocks = 13 for Y,
+downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
+for the height fields).  You must pad the Y data to at least 13*8 = 104
+columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
+MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
+scanlines on each call to jpeg_write_raw_data(), which is to say 16 actual
+sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
+so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
+of Y data is dummy, so it doesn't matter what you pass for it in the data
+arrays, but the scanlines count must total up to 112 so that all of the Cb
+and Cr data gets passed.
+
+Output suspension is supported with raw-data compression: if the data
+destination module suspends, jpeg_write_raw_data() will return 0.
+In this case the same data rows must be passed again on the next call.
+
+
+Decompression with raw data output implies bypassing all postprocessing.
+You must deal with the color space and sampling factors present in the
+incoming file.  If your application only handles, say, 2h1v YCbCr data,
+you must check for and fail on other color spaces or other sampling factors.
+The library will not convert to a different color space for you.
+
+To obtain raw data output, set cinfo->raw_data_out = TRUE before
+jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
+verify that the color space and sampling factors are ones you can handle.
+Furthermore, set cinfo->do_fancy_upsampling = FALSE if you want to get real
+downsampled data (it is set TRUE by jpeg_read_header()).
+Then call jpeg_read_raw_data() in place of jpeg_read_scanlines().  The
+decompression process is otherwise the same as usual.
+
+jpeg_read_raw_data() returns one MCU row per call, and thus you must pass a
+buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
+the same as for raw-data compression).  The buffer you pass must be large
+enough to hold the actual data plus padding to DCT-block boundaries.  As with
+compression, any entirely dummy DCT blocks are not processed so you need not
+allocate space for them, but the total scanline count includes them.  The
+above example of computing buffer dimensions for raw-data compression is
+equally valid for decompression.
+
+Input suspension is supported with raw-data decompression: if the data source
+module suspends, jpeg_read_raw_data() will return 0.  You can also use
+buffered-image mode to read raw data in multiple passes.
+
+
+Really raw data: DCT coefficients
+---------------------------------
+
+It is possible to read or write the contents of a JPEG file as raw DCT
+coefficients.  This facility is mainly intended for use in lossless
+transcoding between different JPEG file formats.  Other possible applications
+include lossless cropping of a JPEG image, lossless reassembly of a
+multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
+
+To read the contents of a JPEG file as DCT coefficients, open the file and do
+jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
+and jpeg_read_scanlines(), call jpeg_read_coefficients().  This will read the
+entire image into a set of virtual coefficient-block arrays, one array per
+component.  The return value is a pointer to an array of virtual-array
+descriptors.  Each virtual array can be accessed directly using the JPEG
+memory manager's access_virt_barray method (see Memory management, below,
+and also read structure.txt's discussion of virtual array handling).  Or,
+for simple transcoding to a different JPEG file format, the array list can
+just be handed directly to jpeg_write_coefficients().
+
+Each block in the block arrays contains quantized coefficient values in
+normal array order (not JPEG zigzag order).  The block arrays contain only
+DCT blocks containing real data; any entirely-dummy blocks added to fill out
+interleaved MCUs at the right or bottom edges of the image are discarded
+during reading and are not stored in the block arrays.  (The size of each
+block array can be determined from the width_in_blocks and height_in_blocks
+fields of the component's comp_info entry.)  This is also the data format
+expected by jpeg_write_coefficients().
+
+When you are done using the virtual arrays, call jpeg_finish_decompress()
+to release the array storage and return the decompression object to an idle
+state; or just call jpeg_destroy() if you don't need to reuse the object.
+
+If you use a suspending data source, jpeg_read_coefficients() will return
+NULL if it is forced to suspend; a non-NULL return value indicates successful
+completion.  You need not test for a NULL return value when using a
+non-suspending data source.
+
+It is also possible to call jpeg_read_coefficients() to obtain access to the
+decoder's coefficient arrays during a normal decode cycle in buffered-image
+mode.  This feature might be useful for progressively displaying an incoming
+image and then re-encoding it without loss.  To do this, decode in buffered-
+image mode as discussed previously, then call jpeg_read_coefficients() after
+the last jpeg_finish_output() call.  The arrays will be available for your use
+until you call jpeg_finish_decompress().
+
+
+To write the contents of a JPEG file as DCT coefficients, you must provide
+the DCT coefficients stored in virtual block arrays.  You can either pass
+block arrays read from an input JPEG file by jpeg_read_coefficients(), or
+allocate virtual arrays from the JPEG compression object and fill them
+yourself.  In either case, jpeg_write_coefficients() is substituted for
+jpeg_start_compress() and jpeg_write_scanlines().  Thus the sequence is
+  * Create compression object
+  * Set all compression parameters as necessary
+  * Request virtual arrays if needed
+  * jpeg_write_coefficients()
+  * jpeg_finish_compress()
+  * Destroy or re-use compression object
+jpeg_write_coefficients() is passed a pointer to an array of virtual block
+array descriptors; the number of arrays is equal to cinfo.num_components.
+
+The virtual arrays need only have been requested, not realized, before
+jpeg_write_coefficients() is called.  A side-effect of
+jpeg_write_coefficients() is to realize any virtual arrays that have been
+requested from the compression object's memory manager.  Thus, when obtaining
+the virtual arrays from the compression object, you should fill the arrays
+after calling jpeg_write_coefficients().  The data is actually written out
+when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
+the file header.
+
+When writing raw DCT coefficients, it is crucial that the JPEG quantization
+tables and sampling factors match the way the data was encoded, or the
+resulting file will be invalid.  For transcoding from an existing JPEG file,
+we recommend using jpeg_copy_critical_parameters().  This routine initializes
+all the compression parameters to default values (like jpeg_set_defaults()),
+then copies the critical information from a source decompression object.
+The decompression object should have just been used to read the entire
+JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
+
+jpeg_write_coefficients() marks all tables stored in the compression object
+as needing to be written to the output file (thus, it acts like
+jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
+emitting abbreviated JPEG files by accident.  If you really want to emit an
+abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
+individual sent_table flags, between calling jpeg_write_coefficients() and
+jpeg_finish_compress().
+
+
+Progress monitoring
+-------------------
+
+Some applications may need to regain control from the JPEG library every so
+often.  The typical use of this feature is to produce a percent-done bar or
+other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
+Although you do get control back frequently during the data-transferring pass
+(the jpeg_read_scanlines or jpeg_write_scanlines loop), any additional passes
+will occur inside jpeg_finish_compress or jpeg_start_decompress; those
+routines may take a long time to execute, and you don't get control back
+until they are done.
+
+You can define a progress-monitor routine which will be called periodically
+by the library.  No guarantees are made about how often this call will occur,
+so we don't recommend you use it for mouse tracking or anything like that.
+At present, a call will occur once per MCU row, scanline, or sample row
+group, whichever unit is convenient for the current processing mode; so the
+wider the image, the longer the time between calls.  During the data-
+transferring pass, only one call occurs per call of jpeg_read_scanlines or
+jpeg_write_scanlines, so don't pass a large number of scanlines at once if
+you want fine resolution in the progress count.  (If you really need to use
+the callback mechanism for time-critical tasks like mouse tracking, you could
+insert additional calls inside some of the library's inner loops.)
+
+To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
+fill in its progress_monitor field with a pointer to your callback routine,
+and set cinfo->progress to point to the struct.  The callback will be called
+whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
+jpeg_create_compress or jpeg_create_decompress; the library will not change
+it thereafter.  So if you allocate dynamic storage for the progress struct,
+make sure it will live as long as the JPEG object does.  Allocating from the
+JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
+can use the same callback routine for both compression and decompression.
+
+The jpeg_progress_mgr struct contains four fields which are set by the library:
+	long pass_counter;	/* work units completed in this pass */
+	long pass_limit;	/* total number of work units in this pass */
+	int completed_passes;	/* passes completed so far */
+	int total_passes;	/* total number of passes expected */
+During any one pass, pass_counter increases from 0 up to (not including)
+pass_limit; the step size is usually but not necessarily 1.  The pass_limit
+value may change from one pass to another.  The expected total number of
+passes is in total_passes, and the number of passes already completed is in
+completed_passes.  Thus the fraction of work completed may be estimated as
+		completed_passes + (pass_counter/pass_limit)
+		--------------------------------------------
+				total_passes
+ignoring the fact that the passes may not be equal amounts of work.
+
+When decompressing, pass_limit can even change within a pass, because it
+depends on the number of scans in the JPEG file, which isn't always known in
+advance.  The computed fraction-of-work-done may jump suddenly (if the library
+discovers it has overestimated the number of scans) or even decrease (in the
+opposite case).  It is not wise to put great faith in the work estimate.
+
+When using the decompressor's buffered-image mode, the progress monitor work
+estimate is likely to be completely unhelpful, because the library has no way
+to know how many output passes will be demanded of it.  Currently, the library
+sets total_passes based on the assumption that there will be one more output
+pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
+TRUE), but no more output passes if the file end has been reached when the
+output pass is started.  This means that total_passes will rise as additional
+output passes are requested.  If you have a way of determining the input file
+size, estimating progress based on the fraction of the file that's been read
+will probably be more useful than using the library's value.
+
+
+Memory management
+-----------------
+
+This section covers some key facts about the JPEG library's built-in memory
+manager.  For more info, please read structure.txt's section about the memory
+manager, and consult the source code if necessary.
+
+All memory and temporary file allocation within the library is done via the
+memory manager.  If necessary, you can replace the "back end" of the memory
+manager to control allocation yourself (for example, if you don't want the
+library to use malloc() and free() for some reason).
+
+Some data is allocated "permanently" and will not be freed until the JPEG
+object is destroyed.  Most data is allocated "per image" and is freed by
+jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
+memory manager yourself to allocate structures that will automatically be
+freed at these times.  Typical code for this is
+  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, size);
+Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
+Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
+There are also alloc_sarray and alloc_barray routines that automatically
+build 2-D sample or block arrays.
+
+The library's minimum space requirements to process an image depend on the
+image's width, but not on its height, because the library ordinarily works
+with "strip" buffers that are as wide as the image but just a few rows high.
+Some operating modes (e.g., two-pass color quantization) require full-image
+buffers.  Such buffers are treated as "virtual arrays": only the current strip
+need be in memory, and the rest can be swapped out to a temporary file.
+
+If you use the simplest memory manager back end (jmemnobs.c), then no
+temporary files are used; virtual arrays are simply malloc()'d.  Images bigger
+than memory can be processed only if your system supports virtual memory.
+The other memory manager back ends support temporary files of various flavors
+and thus work in machines without virtual memory.  They may also be useful on
+Unix machines if you need to process images that exceed available swap space.
+
+When using temporary files, the library will make the in-memory buffers for
+its virtual arrays just big enough to stay within a "maximum memory" setting.
+Your application can set this limit by setting cinfo->mem->max_memory_to_use
+after creating the JPEG object.  (Of course, there is still a minimum size for
+the buffers, so the max-memory setting is effective only if it is bigger than
+the minimum space needed.)  If you allocate any large structures yourself, you
+must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
+order to have them counted against the max memory limit.  Also keep in mind
+that space allocated with alloc_small() is ignored, on the assumption that
+it's too small to be worth worrying about; so a reasonable safety margin
+should be left when setting max_memory_to_use.
+
+If you use the jmemname.c or jmemdos.c memory manager back end, it is
+important to clean up the JPEG object properly to ensure that the temporary
+files get deleted.  (This is especially crucial with jmemdos.c, where the
+"temporary files" may be extended-memory segments; if they are not freed,
+DOS will require a reboot to recover the memory.)  Thus, with these memory
+managers, it's a good idea to provide a signal handler that will trap any
+early exit from your program.  The handler should call either jpeg_abort()
+or jpeg_destroy() for any active JPEG objects.  A handler is not needed with
+jmemnobs.c, and shouldn't be necessary with jmemansi.c or jmemmac.c either,
+since the C library is supposed to take care of deleting files made with
+tmpfile().
+
+
+Memory usage
+------------
+
+Working memory requirements while performing compression or decompression
+depend on image dimensions, image characteristics (such as colorspace and
+JPEG process), and operating mode (application-selected options).
+
+As of v6b, the decompressor requires:
+ 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
+    on operating mode and image characteristics (particularly color vs.
+    grayscale), but it doesn't depend on image dimensions.
+ 2. Strip buffers (of size proportional to the image width) for IDCT and
+    upsampling results.  The worst case for commonly used sampling factors
+    is about 34 bytes * width in pixels for a color image.  A grayscale image
+    only needs about 8 bytes per pixel column.
+ 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
+    file (including progressive JPEGs), or whenever you select buffered-image
+    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
+    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
+    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
+ 4. To perform 2-pass color quantization, the decompressor also needs a
+    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
+This does not count any memory allocated by the application, such as a
+buffer to hold the final output image.
+
+The above figures are valid for 8-bit JPEG data precision and a machine with
+32-bit ints.  For 12-bit JPEG data, double the size of the strip buffers and
+quantization pixel buffer.  The "fixed-size" data will be somewhat smaller
+with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
+color spaces will require different amounts of space.
+
+The full-image coefficient and pixel buffers, if needed at all, do not
+have to be fully RAM resident; you can have the library use temporary
+files instead when the total memory usage would exceed a limit you set.
+(But if your OS supports virtual memory, it's probably better to just use
+jmemnobs and let the OS do the swapping.)
+
+The compressor's memory requirements are similar, except that it has no need
+for color quantization.  Also, it needs a full-image DCT coefficient buffer
+if Huffman-table optimization is asked for, even if progressive mode is not
+requested.
+
+If you need more detailed information about memory usage in a particular
+situation, you can enable the MEM_STATS code in jmemmgr.c.
+
+
+Library compile-time options
+----------------------------
+
+A number of compile-time options are available by modifying jmorecfg.h.
+
+The JPEG standard provides for both the baseline 8-bit DCT process and
+a 12-bit DCT process.  The IJG code supports 12-bit lossy JPEG if you define
+BITS_IN_JSAMPLE as 12 rather than 8.  Note that this causes JSAMPLE to be
+larger than a char, so it affects the surrounding application's image data.
+The sample applications cjpeg and djpeg can support 12-bit mode only for PPM
+and GIF file formats; you must disable the other file formats to compile a
+12-bit cjpeg or djpeg.  (install.txt has more information about that.)
+At present, a 12-bit library can handle *only* 12-bit images, not both
+precisions.  (If you need to include both 8- and 12-bit libraries in a single
+application, you could probably do it by defining NEED_SHORT_EXTERNAL_NAMES
+for just one of the copies.  You'd have to access the 8-bit and 12-bit copies
+from separate application source files.  This is untested ... if you try it,
+we'd like to hear whether it works!)
+
+Note that a 12-bit library always compresses in Huffman optimization mode,
+in order to generate valid Huffman tables.  This is necessary because our
+default Huffman tables only cover 8-bit data.  If you need to output 12-bit
+files in one pass, you'll have to supply suitable default Huffman tables.
+You may also want to supply your own DCT quantization tables; the existing
+quality-scaling code has been developed for 8-bit use, and probably doesn't
+generate especially good tables for 12-bit.
+
+The maximum number of components (color channels) in the image is determined
+by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
+expect that few applications will need more than four or so.
+
+On machines with unusual data type sizes, you may be able to improve
+performance or reduce memory space by tweaking the various typedefs in
+jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
+is quite slow; consider trading memory for speed by making JCOEF, INT16, and
+UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
+You probably don't want to make JSAMPLE be int unless you have lots of memory
+to burn.
+
+You can reduce the size of the library by compiling out various optional
+functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
+
+You can also save a few K by not having text error messages in the library;
+the standard error message table occupies about 5Kb.  This is particularly
+reasonable for embedded applications where there's no good way to display
+a message anyway.  To do this, remove the creation of the message table
+(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
+something reasonable without it.  You could output the numeric value of the
+message code number, for example.  If you do this, you can also save a couple
+more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
+you don't need trace capability anyway, right?
+
+
+Portability considerations
+--------------------------
+
+The JPEG library has been written to be extremely portable; the sample
+applications cjpeg and djpeg are slightly less so.  This section summarizes
+the design goals in this area.  (If you encounter any bugs that cause the
+library to be less portable than is claimed here, we'd appreciate hearing
+about them.)
+
+The code works fine on ANSI C, C++, and pre-ANSI C compilers, using any of
+the popular system include file setups, and some not-so-popular ones too.
+See install.txt for configuration procedures.
+
+The code is not dependent on the exact sizes of the C data types.  As
+distributed, we make the assumptions that
+	char	is at least 8 bits wide
+	short	is at least 16 bits wide
+	int	is at least 16 bits wide
+	long	is at least 32 bits wide
+(These are the minimum requirements of the ANSI C standard.)  Wider types will
+work fine, although memory may be used inefficiently if char is much larger
+than 8 bits or short is much bigger than 16 bits.  The code should work
+equally well with 16- or 32-bit ints.
+
+In a system where these assumptions are not met, you may be able to make the
+code work by modifying the typedefs in jmorecfg.h.  However, you will probably
+have difficulty if int is less than 16 bits wide, since references to plain
+int abound in the code.
+
+char can be either signed or unsigned, although the code runs faster if an
+unsigned char type is available.  If char is wider than 8 bits, you will need
+to redefine JOCTET and/or provide custom data source/destination managers so
+that JOCTET represents exactly 8 bits of data on external storage.
+
+The JPEG library proper does not assume ASCII representation of characters.
+But some of the image file I/O modules in cjpeg/djpeg do have ASCII
+dependencies in file-header manipulation; so does cjpeg's select_file_type()
+routine.
+
+The JPEG library does not rely heavily on the C library.  In particular, C
+stdio is used only by the data source/destination modules and the error
+handler, all of which are application-replaceable.  (cjpeg/djpeg are more
+heavily dependent on stdio.)  malloc and free are called only from the memory
+manager "back end" module, so you can use a different memory allocator by
+replacing that one file.
+
+The code generally assumes that C names must be unique in the first 15
+characters.  However, global function names can be made unique in the
+first 6 characters by defining NEED_SHORT_EXTERNAL_NAMES.
+
+More info about porting the code may be gleaned by reading jconfig.txt,
+jmorecfg.h, and jinclude.h.
+
+
+Notes for MS-DOS implementors
+-----------------------------
+
+The IJG code is designed to work efficiently in 80x86 "small" or "medium"
+memory models (i.e., data pointers are 16 bits unless explicitly declared
+"far"; code pointers can be either size).  You may be able to use small
+model to compile cjpeg or djpeg by itself, but you will probably have to use
+medium model for any larger application.  This won't make much difference in
+performance.  You *will* take a noticeable performance hit if you use a
+large-data memory model (perhaps 10%-25%), and you should avoid "huge" model
+if at all possible.
+
+The JPEG library typically needs 2Kb-3Kb of stack space.  It will also
+malloc about 20K-30K of near heap space while executing (and lots of far
+heap, but that doesn't count in this calculation).  This figure will vary
+depending on selected operating mode, and to a lesser extent on image size.
+There is also about 5Kb-6Kb of constant data which will be allocated in the
+near data segment (about 4Kb of this is the error message table).
+Thus you have perhaps 20K available for other modules' static data and near
+heap space before you need to go to a larger memory model.  The C library's
+static data will account for several K of this, but that still leaves a good
+deal for your needs.  (If you are tight on space, you could reduce the sizes
+of the I/O buffers allocated by jdatasrc.c and jdatadst.c, say from 4K to
+1K.  Another possibility is to move the error message table to far memory;
+this should be doable with only localized hacking on jerror.c.)
+
+About 2K of the near heap space is "permanent" memory that will not be
+released until you destroy the JPEG object.  This is only an issue if you
+save a JPEG object between compression or decompression operations.
+
+Far data space may also be a tight resource when you are dealing with large
+images.  The most memory-intensive case is decompression with two-pass color
+quantization, or single-pass quantization to an externally supplied color
+map.  This requires a 128Kb color lookup table plus strip buffers amounting
+to about 40 bytes per column for typical sampling ratios (e.g., about 25600
+bytes for a 640-pixel-wide image).  You may not be able to process wide
+images if you have large data structures of your own.
+
+Of course, all of these concerns vanish if you use a 32-bit flat-memory-model
+compiler, such as DJGPP or Watcom C.  We highly recommend flat model if you
+can use it; the JPEG library is significantly faster in flat model.
diff --git a/jpeg/makedepend b/jpeg/makedepend
index f7f164973..78ba5373f 100644
--- a/jpeg/makedepend
+++ b/jpeg/makedepend
@@ -1,28 +1,26 @@
 # DO NOT DELETE
 
-jmemnobs.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemnobs.o: jmemsys.h
+jaricom.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapimin.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcarith.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccolor.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcdctmgr.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcdctmgr.o: jdct.h
 jchuff.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jchuff.o: jchuff.h
 jcinit.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcmainct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcmarker.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcmaster.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcomapi.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcparam.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jchuff.h
 jcprepct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcsample.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jctrans.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdarith.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatasrc.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdcoefct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -30,14 +28,11 @@ jdcolor.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jddctmgr.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jddctmgr.o: jdct.h
 jdhuff.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdhuff.o: jdhuff.h
 jdinput.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmerge.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdhuff.h
 jdpostct.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdsample.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdtrans.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -55,10 +50,10 @@ jidctfst.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jidctfst.o: jdct.h
 jidctint.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jidctint.o: jdct.h
-jidctred.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jidctred.o: jdct.h
+jmemmgr.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o: jmemsys.h
+jmemnobs.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemnobs.o: jmemsys.h
 jquant1.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemsys.h
diff --git a/jpeg/structure.doc b/jpeg/structure.doc
deleted file mode 100644
index 51c9def7e..000000000
--- a/jpeg/structure.doc
+++ /dev/null
@@ -1,948 +0,0 @@
-IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
-
-Copyright (C) 1991-1995, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-This file provides an overview of the architecture of the IJG JPEG software;
-that is, the functions of the various modules in the system and the interfaces
-between modules.  For more precise details about any data structure or calling
-convention, see the include files and comments in the source code.
-
-We assume that the reader is already somewhat familiar with the JPEG standard.
-The README file includes references for learning about JPEG.  The file
-libjpeg.doc describes the library from the viewpoint of an application
-programmer using the library; it's best to read that file before this one.
-Also, the file coderules.doc describes the coding style conventions we use.
-
-In this document, JPEG-specific terminology follows the JPEG standard:
-  A "component" means a color channel, e.g., Red or Luminance.
-  A "sample" is a single component value (i.e., one number in the image data).
-  A "coefficient" is a frequency coefficient (a DCT transform output number).
-  A "block" is an 8x8 group of samples or coefficients.
-  An "MCU" (minimum coded unit) is an interleaved set of blocks of size
-	determined by the sampling factors, or a single block in a
-	noninterleaved scan.
-We do not use the terms "pixel" and "sample" interchangeably.  When we say
-pixel, we mean an element of the full-size image, while a sample is an element
-of the downsampled image.  Thus the number of samples may vary across
-components while the number of pixels does not.  (This terminology is not used
-rigorously throughout the code, but it is used in places where confusion would
-otherwise result.)
-
-
-*** System features ***
-
-The IJG distribution contains two parts:
-  * A subroutine library for JPEG compression and decompression.
-  * cjpeg/djpeg, two sample applications that use the library to transform
-    JFIF JPEG files to and from several other image formats.
-cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
-command-line user interface and I/O routines for several uncompressed image
-formats.  This document concentrates on the library itself.
-
-We desire the library to be capable of supporting all JPEG baseline, extended
-sequential, and progressive DCT processes.  Hierarchical processes are not
-supported.
-
-The library does not support the lossless (spatial) JPEG process.  Lossless
-JPEG shares little or no code with lossy JPEG, and would normally be used
-without the extensive pre- and post-processing provided by this library.
-We feel that lossless JPEG is better handled by a separate library.
-
-Within these limits, any set of compression parameters allowed by the JPEG
-spec should be readable for decompression.  (We can be more restrictive about
-what formats we can generate.)  Although the system design allows for all
-parameter values, some uncommon settings are not yet implemented and may
-never be; nonintegral sampling ratios are the prime example.  Furthermore,
-we treat 8-bit vs. 12-bit data precision as a compile-time switch, not a
-run-time option, because most machines can store 8-bit pixels much more
-compactly than 12-bit.
-
-For legal reasons, JPEG arithmetic coding is not currently supported, but
-extending the library to include it would be straightforward.
-
-By itself, the library handles only interchange JPEG datastreams --- in
-particular the widely used JFIF file format.  The library can be used by
-surrounding code to process interchange or abbreviated JPEG datastreams that
-are embedded in more complex file formats.  (For example, libtiff uses this
-library to implement JPEG compression within the TIFF file format.)
-
-The library includes a substantial amount of code that is not covered by the
-JPEG standard but is necessary for typical applications of JPEG.  These
-functions preprocess the image before JPEG compression or postprocess it after
-decompression.  They include colorspace conversion, downsampling/upsampling,
-and color quantization.  This code can be omitted if not needed.
-
-A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
-and even more so in decompression postprocessing.  The decompression library
-provides multiple implementations that cover most of the useful tradeoffs,
-ranging from very-high-quality down to fast-preview operation.  On the
-compression side we have generally not provided low-quality choices, since
-compression is normally less time-critical.  It should be understood that the
-low-quality modes may not meet the JPEG standard's accuracy requirements;
-nonetheless, they are useful for viewers.
-
-
-*** Portability issues ***
-
-Portability is an essential requirement for the library.  The key portability
-issues that show up at the level of system architecture are:
-
-1.  Memory usage.  We want the code to be able to run on PC-class machines
-with limited memory.  Images should therefore be processed sequentially (in
-strips), to avoid holding the whole image in memory at once.  Where a
-full-image buffer is necessary, we should be able to use either virtual memory
-or temporary files.
-
-2.  Near/far pointer distinction.  To run efficiently on 80x86 machines, the
-code should distinguish "small" objects (kept in near data space) from
-"large" ones (kept in far data space).  This is an annoying restriction, but
-fortunately it does not impact code quality for less brain-damaged machines,
-and the source code clutter turns out to be minimal with sufficient use of
-pointer typedefs.
-
-3. Data precision.  We assume that "char" is at least 8 bits, "short" and
-"int" at least 16, "long" at least 32.  The code will work fine with larger
-data sizes, although memory may be used inefficiently in some cases.  However,
-the JPEG compressed datastream must ultimately appear on external storage as a
-sequence of 8-bit bytes if it is to conform to the standard.  This may pose a
-problem on machines where char is wider than 8 bits.  The library represents
-compressed data as an array of values of typedef JOCTET.  If no data type
-exactly 8 bits wide is available, custom data source and data destination
-modules must be written to unpack and pack the chosen JOCTET datatype into
-8-bit external representation.
-
-
-*** System overview ***
-
-The compressor and decompressor are each divided into two main sections:
-the JPEG compressor or decompressor proper, and the preprocessing or
-postprocessing functions.  The interface between these two sections is the
-image data that the official JPEG spec regards as its input or output: this
-data is in the colorspace to be used for compression, and it is downsampled
-to the sampling factors to be used.  The preprocessing and postprocessing
-steps are responsible for converting a normal image representation to or from
-this form.  (Those few applications that want to deal with YCbCr downsampled
-data can skip the preprocessing or postprocessing step.)
-
-Looking more closely, the compressor library contains the following main
-elements:
-
-  Preprocessing:
-    * Color space conversion (e.g., RGB to YCbCr).
-    * Edge expansion and downsampling.  Optionally, this step can do simple
-      smoothing --- this is often helpful for low-quality source data.
-  JPEG proper:
-    * MCU assembly, DCT, quantization.
-    * Entropy coding (sequential or progressive, Huffman or arithmetic).
-
-In addition to these modules we need overall control, marker generation,
-and support code (memory management & error handling).  There is also a
-module responsible for physically writing the output data --- typically
-this is just an interface to fwrite(), but some applications may need to
-do something else with the data.
-
-The decompressor library contains the following main elements:
-
-  JPEG proper:
-    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
-    * Dequantization, inverse DCT, MCU disassembly.
-  Postprocessing:
-    * Upsampling.  Optionally, this step may be able to do more general
-      rescaling of the image.
-    * Color space conversion (e.g., YCbCr to RGB).  This step may also
-      provide gamma adjustment [ currently it does not ].
-    * Optional color quantization (e.g., reduction to 256 colors).
-    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
-      [This feature is not currently implemented.]
-
-We also need overall control, marker parsing, and a data source module.
-The support code (memory management & error handling) can be shared with
-the compression half of the library.
-
-There may be several implementations of each of these elements, particularly
-in the decompressor, where a wide range of speed/quality tradeoffs is very
-useful.  It must be understood that some of the best speedups involve
-merging adjacent steps in the pipeline.  For example, upsampling, color space
-conversion, and color quantization might all be done at once when using a
-low-quality ordered-dither technique.  The system architecture is designed to
-allow such merging where appropriate.
-
-
-Note: it is convenient to regard edge expansion (padding to block boundaries)
-as a preprocessing/postprocessing function, even though the JPEG spec includes
-it in compression/decompression.  We do this because downsampling/upsampling
-can be simplified a little if they work on padded data: it's not necessary to
-have special cases at the right and bottom edges.  Therefore the interface
-buffer is always an integral number of blocks wide and high, and we expect
-compression preprocessing to pad the source data properly.  Padding will occur
-only to the next block (8-sample) boundary.  In an interleaved-scan situation,
-additional dummy blocks may be used to fill out MCUs, but the MCU assembly and
-disassembly logic will create or discard these blocks internally.  (This is
-advantageous for speed reasons, since we avoid DCTing the dummy blocks.
-It also permits a small reduction in file size, because the compressor can
-choose dummy block contents so as to minimize their size in compressed form.
-Finally, it makes the interface buffer specification independent of whether
-the file is actually interleaved or not.)  Applications that wish to deal
-directly with the downsampled data must provide similar buffering and padding
-for odd-sized images.
-
-
-*** Poor man's object-oriented programming ***
-
-It should be clear by now that we have a lot of quasi-independent processing
-steps, many of which have several possible behaviors.  To avoid cluttering the
-code with lots of switch statements, we use a simple form of object-style
-programming to separate out the different possibilities.
-
-For example, two different color quantization algorithms could be implemented
-as two separate modules that present the same external interface; at runtime,
-the calling code will access the proper module indirectly through an "object".
-
-We can get the limited features we need while staying within portable C.
-The basic tool is a function pointer.  An "object" is just a struct
-containing one or more function pointer fields, each of which corresponds to
-a method name in real object-oriented languages.  During initialization we
-fill in the function pointers with references to whichever module we have
-determined we need to use in this run.  Then invocation of the module is done
-by indirecting through a function pointer; on most machines this is no more
-expensive than a switch statement, which would be the only other way of
-making the required run-time choice.  The really significant benefit, of
-course, is keeping the source code clean and well structured.
-
-We can also arrange to have private storage that varies between different
-implementations of the same kind of object.  We do this by making all the
-module-specific object structs be separately allocated entities, which will
-be accessed via pointers in the master compression or decompression struct.
-The "public" fields or methods for a given kind of object are specified by
-a commonly known struct.  But a module's initialization code can allocate
-a larger struct that contains the common struct as its first member, plus
-additional private fields.  With appropriate pointer casting, the module's
-internal functions can access these private fields.  (For a simple example,
-see jdatadst.c, which implements the external interface specified by struct
-jpeg_destination_mgr, but adds extra fields.)
-
-(Of course this would all be a lot easier if we were using C++, but we are
-not yet prepared to assume that everyone has a C++ compiler.)
-
-An important benefit of this scheme is that it is easy to provide multiple
-versions of any method, each tuned to a particular case.  While a lot of
-precalculation might be done to select an optimal implementation of a method,
-the cost per invocation is constant.  For example, the upsampling step might
-have a "generic" method, plus one or more "hardwired" methods for the most
-popular sampling factors; the hardwired methods would be faster because they'd
-use straight-line code instead of for-loops.  The cost to determine which
-method to use is paid only once, at startup, and the selection criteria are
-hidden from the callers of the method.
-
-This plan differs a little bit from usual object-oriented structures, in that
-only one instance of each object class will exist during execution.  The
-reason for having the class structure is that on different runs we may create
-different instances (choose to execute different modules).  You can think of
-the term "method" as denoting the common interface presented by a particular
-set of interchangeable functions, and "object" as denoting a group of related
-methods, or the total shared interface behavior of a group of modules.
-
-
-*** Overall control structure ***
-
-We previously mentioned the need for overall control logic in the compression
-and decompression libraries.  In IJG implementations prior to v5, overall
-control was mostly provided by "pipeline control" modules, which proved to be
-large, unwieldy, and hard to understand.  To improve the situation, the
-control logic has been subdivided into multiple modules.  The control modules
-consist of:
-
-1. Master control for module selection and initialization.  This has two
-responsibilities:
-
-   1A.  Startup initialization at the beginning of image processing.
-        The individual processing modules to be used in this run are selected
-        and given initialization calls.
-
-   1B.  Per-pass control.  This determines how many passes will be performed
-        and calls each active processing module to configure itself
-        appropriately at the beginning of each pass.  End-of-pass processing,
-	where necessary, is also invoked from the master control module.
-
-   Method selection is partially distributed, in that a particular processing
-   module may contain several possible implementations of a particular method,
-   which it will select among when given its initialization call.  The master
-   control code need only be concerned with decisions that affect more than
-   one module.
- 
-2. Data buffering control.  A separate control module exists for each
-   inter-processing-step data buffer.  This module is responsible for
-   invoking the processing steps that write or read that data buffer.
-
-Each buffer controller sees the world as follows:
-
-input data => processing step A => buffer => processing step B => output data
-                      |              |               |
-              ------------------ controller ------------------
-
-The controller knows the dataflow requirements of steps A and B: how much data
-they want to accept in one chunk and how much they output in one chunk.  Its
-function is to manage its buffer and call A and B at the proper times.
-
-A data buffer control module may itself be viewed as a processing step by a
-higher-level control module; thus the control modules form a binary tree with
-elementary processing steps at the leaves of the tree.
-
-The control modules are objects.  A considerable amount of flexibility can
-be had by replacing implementations of a control module.  For example:
-* Merging of adjacent steps in the pipeline is done by replacing a control
-  module and its pair of processing-step modules with a single processing-
-  step module.  (Hence the possible merges are determined by the tree of
-  control modules.)
-* In some processing modes, a given interstep buffer need only be a "strip"
-  buffer large enough to accommodate the desired data chunk sizes.  In other
-  modes, a full-image buffer is needed and several passes are required.
-  The control module determines which kind of buffer is used and manipulates
-  virtual array buffers as needed.  One or both processing steps may be
-  unaware of the multi-pass behavior.
-
-In theory, we might be able to make all of the data buffer controllers
-interchangeable and provide just one set of implementations for all.  In
-practice, each one contains considerable special-case processing for its
-particular job.  The buffer controller concept should be regarded as an
-overall system structuring principle, not as a complete description of the
-task performed by any one controller.
-
-
-*** Compression object structure ***
-
-Here is a sketch of the logical structure of the JPEG compression library:
-
-                                                 |-- Colorspace conversion
-                  |-- Preprocessing controller --|
-                  |                              |-- Downsampling
-Main controller --|
-                  |                            |-- Forward DCT, quantize
-                  |-- Coefficient controller --|
-                                               |-- Entropy encoding
-
-This sketch also describes the flow of control (subroutine calls) during
-typical image data processing.  Each of the components shown in the diagram is
-an "object" which may have several different implementations available.  One
-or more source code files contain the actual implementation(s) of each object.
-
-The objects shown above are:
-
-* Main controller: buffer controller for the subsampled-data buffer, which
-  holds the preprocessed input data.  This controller invokes preprocessing to
-  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
-  usually no need for a full-image buffer here; a strip buffer is adequate.
-
-* Preprocessing controller: buffer controller for the downsampling input data
-  buffer, which lies between colorspace conversion and downsampling.  Note
-  that a unified conversion/downsampling module would probably replace this
-  controller entirely.
-
-* Colorspace conversion: converts application image data into the desired
-  JPEG color space; also changes the data from pixel-interleaved layout to
-  separate component planes.  Processes one pixel row at a time.
-
-* Downsampling: performs reduction of chroma components as required.
-  Optionally may perform pixel-level smoothing as well.  Processes a "row
-  group" at a time, where a row group is defined as Vmax pixel rows of each
-  component before downsampling, and Vk sample rows afterwards (remember Vk
-  differs across components).  Some downsampling or smoothing algorithms may
-  require context rows above and below the current row group; the
-  preprocessing controller is responsible for supplying these rows via proper
-  buffering.  The downsampler is responsible for edge expansion at the right
-  edge (i.e., extending each sample row to a multiple of 8 samples); but the
-  preprocessing controller is responsible for vertical edge expansion (i.e.,
-  duplicating the bottom sample row as needed to make a multiple of 8 rows).
-
-* Coefficient controller: buffer controller for the DCT-coefficient data.
-  This controller handles MCU assembly, including insertion of dummy DCT
-  blocks when needed at the right or bottom edge.  When performing
-  Huffman-code optimization or emitting a multiscan JPEG file, this
-  controller is responsible for buffering the full image.  The equivalent of
-  one fully interleaved MCU row of subsampled data is processed per call,
-  even when the JPEG file is noninterleaved.
-
-* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
-  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
-  emitted in normal array order, which the entropy encoder is expected to
-  convert to zigzag order as necessary.  Prior versions of the IJG code did
-  the conversion to zigzag order within the quantization step.)
-
-* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
-  coded data to the data destination module.  Works on one MCU per call.
-  For progressive JPEG, the same DCT blocks are fed to the entropy coder
-  during each pass, and the coder must emit the appropriate subset of
-  coefficients.
-
-In addition to the above objects, the compression library includes these
-objects:
-
-* Master control: determines the number of passes required, controls overall
-  and per-pass initialization of the other modules.
-
-* Marker writing: generates JPEG markers (except for RSTn, which is emitted
-  by the entropy encoder when needed).
-
-* Data destination manager: writes the output JPEG datastream to its final
-  destination (e.g., a file).  The destination manager supplied with the
-  library knows how to write to a stdio stream; for other behaviors, the
-  surrounding application may provide its own destination manager.
-
-* Memory manager: allocates and releases memory, controls virtual arrays
-  (with backing store management, where required).
-
-* Error handler: performs formatting and output of error and trace messages;
-  determines handling of nonfatal errors.  The surrounding application may
-  override some or all of this object's methods to change error handling.
-
-* Progress monitor: supports output of "percent-done" progress reports.
-  This object represents an optional callback to the surrounding application:
-  if wanted, it must be supplied by the application.
-
-The error handler, destination manager, and progress monitor objects are
-defined as separate objects in order to simplify application-specific
-customization of the JPEG library.  A surrounding application may override
-individual methods or supply its own all-new implementation of one of these
-objects.  The object interfaces for these objects are therefore treated as
-part of the application interface of the library, whereas the other objects
-are internal to the library.
-
-The error handler and memory manager are shared by JPEG compression and
-decompression; the progress monitor, if used, may be shared as well.
-
-
-*** Decompression object structure ***
-
-Here is a sketch of the logical structure of the JPEG decompression library:
-
-                                               |-- Entropy decoding
-                  |-- Coefficient controller --|
-                  |                            |-- Dequantize, Inverse DCT
-Main controller --|
-                  |                               |-- Upsampling
-                  |-- Postprocessing controller --|   |-- Colorspace conversion
-                                                  |-- Color quantization
-                                                  |-- Color precision reduction
-
-As before, this diagram also represents typical control flow.  The objects
-shown are:
-
-* Main controller: buffer controller for the subsampled-data buffer, which
-  holds the output of JPEG decompression proper.  This controller's primary
-  task is to feed the postprocessing procedure.  Some upsampling algorithms
-  may require context rows above and below the current row group; when this
-  is true, the main controller is responsible for managing its buffer so as
-  to make context rows available.  In the current design, the main buffer is
-  always a strip buffer; a full-image buffer is never required.
-
-* Coefficient controller: buffer controller for the DCT-coefficient data.
-  This controller handles MCU disassembly, including deletion of any dummy
-  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
-  file, this controller is responsible for buffering the full image.
-  (Buffering DCT coefficients, rather than samples, is necessary to support
-  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
-  subsampled data is processed per call, even when the source JPEG file is
-  noninterleaved.
-
-* Entropy decoding: Read coded data from the data source module and perform
-  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
-  For progressive JPEG decoding, the coefficient controller supplies the prior
-  coefficients of each MCU (initially all zeroes), which the entropy decoder
-  modifies in each scan.
-
-* Dequantization and inverse DCT: like it says.  Note that the coefficients
-  buffered by the coefficient controller have NOT been dequantized; we
-  merge dequantization and inverse DCT into a single step for speed reasons.
-  When scaled-down output is asked for, simplified DCT algorithms may be used
-  that emit only 1x1, 2x2, or 4x4 samples per DCT block, not the full 8x8.
-  Works on one DCT block at a time.
-
-* Postprocessing controller: buffer controller for the color quantization
-  input buffer, when quantization is in use.  (Without quantization, this
-  controller just calls the upsampler.)  For two-pass quantization, this
-  controller is responsible for buffering the full-image data.
-
-* Upsampling: restores chroma components to full size.  (May support more
-  general output rescaling, too.  Note that if undersized DCT outputs have
-  been emitted by the DCT module, this module must adjust so that properly
-  sized outputs are created.)  Works on one row group at a time.  This module
-  also calls the color conversion module, so its top level is effectively a
-  buffer controller for the upsampling->color conversion buffer.  However, in
-  all but the highest-quality operating modes, upsampling and color
-  conversion are likely to be merged into a single step.
-
-* Colorspace conversion: convert from JPEG color space to output color space,
-  and change data layout from separate component planes to pixel-interleaved.
-  Works on one pixel row at a time.
-
-* Color quantization: reduce the data to colormapped form, using either an
-  externally specified colormap or an internally generated one.  This module
-  is not used for full-color output.  Works on one pixel row at a time; may
-  require two passes to generate a color map.  Note that the output will
-  always be a single component representing colormap indexes.  In the current
-  design, the output values are JSAMPLEs, so an 8-bit compilation cannot
-  quantize to more than 256 colors.  This is unlikely to be a problem in
-  practice.
-
-* Color reduction: this module handles color precision reduction, e.g.,
-  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
-  Not quite clear yet how this should be handled... should we merge it with
-  colorspace conversion???
-
-Note that some high-speed operating modes might condense the entire
-postprocessing sequence to a single module (upsample, color convert, and
-quantize in one step).
-
-In addition to the above objects, the decompression library includes these
-objects:
-
-* Master control: determines the number of passes required, controls overall
-  and per-pass initialization of the other modules.  This is subdivided into
-  input and output control: jdinput.c controls only input-side processing,
-  while jdmaster.c handles overall initialization and output-side control.
-
-* Marker reading: decodes JPEG markers (except for RSTn).
-
-* Data source manager: supplies the input JPEG datastream.  The source
-  manager supplied with the library knows how to read from a stdio stream;
-  for other behaviors, the surrounding application may provide its own source
-  manager.
-
-* Memory manager: same as for compression library.
-
-* Error handler: same as for compression library.
-
-* Progress monitor: same as for compression library.
-
-As with compression, the data source manager, error handler, and progress
-monitor are candidates for replacement by a surrounding application.
-
-
-*** Decompression input and output separation ***
-
-To support efficient incremental display of progressive JPEG files, the
-decompressor is divided into two sections that can run independently:
-
-1. Data input includes marker parsing, entropy decoding, and input into the
-   coefficient controller's DCT coefficient buffer.  Note that this
-   processing is relatively cheap and fast.
-
-2. Data output reads from the DCT coefficient buffer and performs the IDCT
-   and all postprocessing steps.
-
-For a progressive JPEG file, the data input processing is allowed to get
-arbitrarily far ahead of the data output processing.  (This occurs only
-if the application calls jpeg_consume_input(); otherwise input and output
-run in lockstep, since the input section is called only when the output
-section needs more data.)  In this way the application can avoid making
-extra display passes when data is arriving faster than the display pass
-can run.  Furthermore, it is possible to abort an output pass without
-losing anything, since the coefficient buffer is read-only as far as the
-output section is concerned.  See libjpeg.doc for more detail.
-
-A full-image coefficient array is only created if the JPEG file has multiple
-scans (or if the application specifies buffered-image mode anyway).  When
-reading a single-scan file, the coefficient controller normally creates only
-a one-MCU buffer, so input and output processing must run in lockstep in this
-case.  jpeg_consume_input() is effectively a no-op in this situation.
-
-The main impact of dividing the decompressor in this fashion is that we must
-be very careful with shared variables in the cinfo data structure.  Each
-variable that can change during the course of decompression must be
-classified as belonging to data input or data output, and each section must
-look only at its own variables.  For example, the data output section may not
-depend on any of the variables that describe the current scan in the JPEG
-file, because these may change as the data input section advances into a new
-scan.
-
-The progress monitor is (somewhat arbitrarily) defined to treat input of the
-file as one pass when buffered-image mode is not used, and to ignore data
-input work completely when buffered-image mode is used.  Note that the
-library has no reliable way to predict the number of passes when dealing
-with a progressive JPEG file, nor can it predict the number of output passes
-in buffered-image mode.  So the work estimate is inherently bogus anyway.
-
-No comparable division is currently made in the compression library, because
-there isn't any real need for it.
-
-
-*** Data formats ***
-
-Arrays of pixel sample values use the following data structure:
-
-    typedef something JSAMPLE;		a pixel component value, 0..MAXJSAMPLE
-    typedef JSAMPLE *JSAMPROW;		ptr to a row of samples
-    typedef JSAMPROW *JSAMPARRAY;	ptr to a list of rows
-    typedef JSAMPARRAY *JSAMPIMAGE;	ptr to a list of color-component arrays
-
-The basic element type JSAMPLE will typically be one of unsigned char,
-(signed) char, or short.  Short will be used if samples wider than 8 bits are
-to be supported (this is a compile-time option).  Otherwise, unsigned char is
-used if possible.  If the compiler only supports signed chars, then it is
-necessary to mask off the value when reading.  Thus, all reads of JSAMPLE
-values must be coded as "GETJSAMPLE(value)", where the macro will be defined
-as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
-
-With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
-simplify correct rounding during downsampling, etc.  The JPEG standard's
-specification that sample values run from -128..127 is accommodated by
-subtracting 128 just as the sample value is copied into the source array for
-the DCT step (this will be an array of signed ints).  Similarly, during
-decompression the output of the IDCT step will be immediately shifted back to
-0..255.  (NB: different values are required when 12-bit samples are in use.
-The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
-defined as 255 and 128 respectively in an 8-bit implementation, and as 4095
-and 2048 in a 12-bit implementation.)
-
-We use a pointer per row, rather than a two-dimensional JSAMPLE array.  This
-choice costs only a small amount of memory and has several benefits:
-* Code using the data structure doesn't need to know the allocated width of
-  the rows.  This simplifies edge expansion/compression, since we can work
-  in an array that's wider than the logical picture width.
-* Indexing doesn't require multiplication; this is a performance win on many
-  machines.
-* Arrays with more than 64K total elements can be supported even on machines
-  where malloc() cannot allocate chunks larger than 64K.
-* The rows forming a component array may be allocated at different times
-  without extra copying.  This trick allows some speedups in smoothing steps
-  that need access to the previous and next rows.
-
-Note that each color component is stored in a separate array; we don't use the
-traditional layout in which the components of a pixel are stored together.
-This simplifies coding of modules that work on each component independently,
-because they don't need to know how many components there are.  Furthermore,
-we can read or write each component to a temporary file independently, which
-is helpful when dealing with noninterleaved JPEG files.
-
-In general, a specific sample value is accessed by code such as
-	GETJSAMPLE(image[colorcomponent][row][col])
-where col is measured from the image left edge, but row is measured from the
-first sample row currently in memory.  Either of the first two indexings can
-be precomputed by copying the relevant pointer.
-
-
-Since most image-processing applications prefer to work on images in which
-the components of a pixel are stored together, the data passed to or from the
-surrounding application uses the traditional convention: a single pixel is
-represented by N consecutive JSAMPLE values, and an image row is an array of
-(# of color components)*(image width) JSAMPLEs.  One or more rows of data can
-be represented by a pointer of type JSAMPARRAY in this scheme.  This scheme is
-converted to component-wise storage inside the JPEG library.  (Applications
-that want to skip JPEG preprocessing or postprocessing will have to contend
-with component-wise storage.)
-
-
-Arrays of DCT-coefficient values use the following data structure:
-
-    typedef short JCOEF;		a 16-bit signed integer
-    typedef JCOEF JBLOCK[DCTSIZE2];	an 8x8 block of coefficients
-    typedef JBLOCK *JBLOCKROW;		ptr to one horizontal row of 8x8 blocks
-    typedef JBLOCKROW *JBLOCKARRAY;	ptr to a list of such rows
-    typedef JBLOCKARRAY *JBLOCKIMAGE;	ptr to a list of color component arrays
-
-The underlying type is at least a 16-bit signed integer; while "short" is big
-enough on all machines of interest, on some machines it is preferable to use
-"int" for speed reasons, despite the storage cost.  Coefficients are grouped
-into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
-"8" and "64").
-
-The contents of a coefficient block may be in either "natural" or zigzagged
-order, and may be true values or divided by the quantization coefficients,
-depending on where the block is in the processing pipeline.  In the current
-library, coefficient blocks are kept in natural order everywhere; the entropy
-codecs zigzag or dezigzag the data as it is written or read.  The blocks
-contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
-(This latter decision may need to be revisited to support variable
-quantization a la JPEG Part 3.)
-
-Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
-eight rows of samples.  Otherwise the structure is much the same as for
-samples, and for the same reasons.
-
-On machines where malloc() can't handle a request bigger than 64Kb, this data
-structure limits us to rows of less than 512 JBLOCKs, or a picture width of
-4000+ pixels.  This seems an acceptable restriction.
-
-
-On 80x86 machines, the bottom-level pointer types (JSAMPROW and JBLOCKROW)
-must be declared as "far" pointers, but the upper levels can be "near"
-(implying that the pointer lists are allocated in the DS segment).
-We use a #define symbol FAR, which expands to the "far" keyword when
-compiling on 80x86 machines and to nothing elsewhere.
-
-
-*** Suspendable processing ***
-
-In some applications it is desirable to use the JPEG library as an
-incremental, memory-to-memory filter.  In this situation the data source or
-destination may be a limited-size buffer, and we can't rely on being able to
-empty or refill the buffer at arbitrary times.  Instead the application would
-like to have control return from the library at buffer overflow/underrun, and
-then resume compression or decompression at a later time.
-
-This scenario is supported for simple cases.  (For anything more complex, we
-recommend that the application "bite the bullet" and develop real multitasking
-capability.)  The libjpeg.doc file goes into more detail about the usage and
-limitations of this capability; here we address the implications for library
-structure.
-
-The essence of the problem is that the entropy codec (coder or decoder) must
-be prepared to stop at arbitrary times.  In turn, the controllers that call
-the entropy codec must be able to stop before having produced or consumed all
-the data that they normally would handle in one call.  That part is reasonably
-straightforward: we make the controller call interfaces include "progress
-counters" which indicate the number of data chunks successfully processed, and
-we require callers to test the counter rather than just assume all of the data
-was processed.
-
-Rather than trying to restart at an arbitrary point, the current Huffman
-codecs are designed to restart at the beginning of the current MCU after a
-suspension due to buffer overflow/underrun.  At the start of each call, the
-codec's internal state is loaded from permanent storage (in the JPEG object
-structures) into local variables.  On successful completion of the MCU, the
-permanent state is updated.  (This copying is not very expensive, and may even
-lead to *improved* performance if the local variables can be registerized.)
-If a suspension occurs, the codec simply returns without updating the state,
-thus effectively reverting to the start of the MCU.  Note that this implies
-leaving some data unprocessed in the source/destination buffer (ie, the
-compressed partial MCU).  The data source/destination module interfaces are
-specified so as to make this possible.  This also implies that the data buffer
-must be large enough to hold a worst-case compressed MCU; a couple thousand
-bytes should be enough.
-
-In a successive-approximation AC refinement scan, the progressive Huffman
-decoder has to be able to undo assignments of newly nonzero coefficients if it
-suspends before the MCU is complete, since decoding requires distinguishing
-previously-zero and previously-nonzero coefficients.  This is a bit tedious
-but probably won't have much effect on performance.  Other variants of Huffman
-decoding need not worry about this, since they will just store the same values
-again if forced to repeat the MCU.
-
-This approach would probably not work for an arithmetic codec, since its
-modifiable state is quite large and couldn't be copied cheaply.  Instead it
-would have to suspend and resume exactly at the point of the buffer end.
-
-The JPEG marker reader is designed to cope with suspension at an arbitrary
-point.  It does so by backing up to the start of the marker parameter segment,
-so the data buffer must be big enough to hold the largest marker of interest.
-Again, a couple KB should be adequate.  (A special "skip" convention is used
-to bypass COM and APPn markers, so these can be larger than the buffer size
-without causing problems; otherwise a 64K buffer would be needed in the worst
-case.)
-
-The JPEG marker writer currently does *not* cope with suspension.  I feel that
-this is not necessary; it is much easier simply to require the application to
-ensure there is enough buffer space before starting.  (An empty 2K buffer is
-more than sufficient for the header markers; and ensuring there are a dozen or
-two bytes available before calling jpeg_finish_compress() will suffice for the
-trailer.)  This would not work for writing multi-scan JPEG files, but
-we simply do not intend to support that capability with suspension.
-
-
-*** Memory manager services ***
-
-The JPEG library's memory manager controls allocation and deallocation of
-memory, and it manages large "virtual" data arrays on machines where the
-operating system does not provide virtual memory.  Note that the same
-memory manager serves both compression and decompression operations.
-
-In all cases, allocated objects are tied to a particular compression or
-decompression master record, and they will be released when that master
-record is destroyed.
-
-The memory manager does not provide explicit deallocation of objects.
-Instead, objects are created in "pools" of free storage, and a whole pool
-can be freed at once.  This approach helps prevent storage-leak bugs, and
-it speeds up operations whenever malloc/free are slow (as they often are).
-The pools can be regarded as lifetime identifiers for objects.  Two
-pools/lifetimes are defined:
-  * JPOOL_PERMANENT	lasts until master record is destroyed
-  * JPOOL_IMAGE		lasts until done with image (JPEG datastream)
-Permanent lifetime is used for parameters and tables that should be carried
-across from one datastream to another; this includes all application-visible
-parameters.  Image lifetime is used for everything else.  (A third lifetime,
-JPOOL_PASS = one processing pass, was originally planned.  However it was
-dropped as not being worthwhile.  The actual usage patterns are such that the
-peak memory usage would be about the same anyway; and having per-pass storage
-substantially complicates the virtual memory allocation rules --- see below.)
-
-The memory manager deals with three kinds of object:
-1. "Small" objects.  Typically these require no more than 10K-20K total.
-2. "Large" objects.  These may require tens to hundreds of K depending on
-   image size.  Semantically they behave the same as small objects, but we
-   distinguish them for two reasons:
-     * On MS-DOS machines, large objects are referenced by FAR pointers,
-       small objects by NEAR pointers.
-     * Pool allocation heuristics may differ for large and small objects.
-   Note that individual "large" objects cannot exceed the size allowed by
-   type size_t, which may be 64K or less on some machines.
-3. "Virtual" objects.  These are large 2-D arrays of JSAMPLEs or JBLOCKs
-   (typically large enough for the entire image being processed).  The
-   memory manager provides stripwise access to these arrays.  On machines
-   without virtual memory, the rest of the array may be swapped out to a
-   temporary file.
-
-(Note: JSAMPARRAY and JBLOCKARRAY data structures are a combination of large
-objects for the data proper and small objects for the row pointers.  For
-convenience and speed, the memory manager provides single routines to create
-these structures.  Similarly, virtual arrays include a small control block
-and a JSAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
-
-In the present implementation, virtual arrays are only permitted to have image
-lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
-not very useful since a virtual array's raison d'etre is to store data for
-multiple passes through the image.)  We also expect that only "small" objects
-will be given permanent lifespan, though this restriction is not required by
-the memory manager.
-
-In a non-virtual-memory machine, some performance benefit can be gained by
-making the in-memory buffers for virtual arrays be as large as possible.
-(For small images, the buffers might fit entirely in memory, so blind
-swapping would be very wasteful.)  The memory manager will adjust the height
-of the buffers to fit within a prespecified maximum memory usage.  In order
-to do this in a reasonably optimal fashion, the manager needs to allocate all
-of the virtual arrays at once.  Therefore, there isn't a one-step allocation
-routine for virtual arrays; instead, there is a "request" routine that simply
-allocates the control block, and a "realize" routine (called just once) that
-determines space allocation and creates all of the actual buffers.  The
-realize routine must allow for space occupied by non-virtual large objects.
-(We don't bother to factor in the space needed for small objects, on the
-grounds that it isn't worth the trouble.)
-
-To support all this, we establish the following protocol for doing business
-with the memory manager:
-  1. Modules must request virtual arrays (which may have only image lifespan)
-     during the initial setup phase, i.e., in their jinit_xxx routines.
-  2. All "large" objects (including JSAMPARRAYs and JBLOCKARRAYs) must also be
-     allocated during initial setup.
-  3. realize_virt_arrays will be called at the completion of initial setup.
-     The above conventions ensure that sufficient information is available
-     for it to choose a good size for virtual array buffers.
-Small objects of any lifespan may be allocated at any time.  We expect that
-the total space used for small objects will be small enough to be negligible
-in the realize_virt_arrays computation.
-
-In a virtual-memory machine, we simply pretend that the available space is
-infinite, thus causing realize_virt_arrays to decide that it can allocate all
-the virtual arrays as full-size in-memory buffers.  The overhead of the
-virtual-array access protocol is very small when no swapping occurs.
-
-A virtual array can be specified to be "pre-zeroed"; when this flag is set,
-never-yet-written sections of the array are set to zero before being made
-available to the caller.  If this flag is not set, never-written sections
-of the array contain garbage.  (This feature exists primarily because the
-equivalent logic would otherwise be needed in jdcoefct.c for progressive
-JPEG mode; we may as well make it available for possible other uses.)
-
-The first write pass on a virtual array is required to occur in top-to-bottom
-order; read passes, as well as any write passes after the first one, may
-access the array in any order.  This restriction exists partly to simplify
-the virtual array control logic, and partly because some file systems may not
-support seeking beyond the current end-of-file in a temporary file.  The main
-implication of this restriction is that rearrangement of rows (such as
-converting top-to-bottom data order to bottom-to-top) must be handled while
-reading data out of the virtual array, not while putting it in.
-
-
-*** Memory manager internal structure ***
-
-To isolate system dependencies as much as possible, we have broken the
-memory manager into two parts.  There is a reasonably system-independent
-"front end" (jmemmgr.c) and a "back end" that contains only the code
-likely to change across systems.  All of the memory management methods
-outlined above are implemented by the front end.  The back end provides
-the following routines for use by the front end (none of these routines
-are known to the rest of the JPEG code):
-
-jpeg_mem_init, jpeg_mem_term	system-dependent initialization/shutdown
-
-jpeg_get_small, jpeg_free_small	interface to malloc and free library routines
-				(or their equivalents)
-
-jpeg_get_large, jpeg_free_large	interface to FAR malloc/free in MSDOS machines;
-				else usually the same as
-				jpeg_get_small/jpeg_free_small
-
-jpeg_mem_available		estimate available memory
-
-jpeg_open_backing_store		create a backing-store object
-
-read_backing_store,		manipulate a backing-store object
-write_backing_store,
-close_backing_store
-
-On some systems there will be more than one type of backing-store object
-(specifically, in MS-DOS a backing store file might be an area of extended
-memory as well as a disk file).  jpeg_open_backing_store is responsible for
-choosing how to implement a given object.  The read/write/close routines
-are method pointers in the structure that describes a given object; this
-lets them be different for different object types.
-
-It may be necessary to ensure that backing store objects are explicitly
-released upon abnormal program termination.  For example, MS-DOS won't free
-extended memory by itself.  To support this, we will expect the main program
-or surrounding application to arrange to call self_destruct (typically via
-jpeg_destroy) upon abnormal termination.  This may require a SIGINT signal
-handler or equivalent.  We don't want to have the back end module install its
-own signal handler, because that would pre-empt the surrounding application's
-ability to control signal handling.
-
-The IJG distribution includes several memory manager back end implementations.
-Usually the same back end should be suitable for all applications on a given
-system, but it is possible for an application to supply its own back end at
-need.
-
-
-*** Implications of DNL marker ***
-
-Some JPEG files may use a DNL marker to postpone definition of the image
-height (this would be useful for a fax-like scanner's output, for instance).
-In these files the SOF marker claims the image height is 0, and you only
-find out the true image height at the end of the first scan.
-
-We could read these files as follows:
-1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
-2. When the DNL is found, update the image height in the global image
-   descriptor.
-This implies that control modules must avoid making copies of the image
-height, and must re-test for termination after each MCU row.  This would
-be easy enough to do.
-
-In cases where image-size data structures are allocated, this approach will
-result in very inefficient use of virtual memory or much-larger-than-necessary
-temporary files.  This seems acceptable for something that probably won't be a
-mainstream usage.  People might have to forgo use of memory-hogging options
-(such as two-pass color quantization or noninterleaved JPEG files) if they
-want efficient conversion of such files.  (One could improve efficiency by
-demanding a user-supplied upper bound for the height, less than 65536; in most
-cases it could be much less.)
-
-The standard also permits the SOF marker to overestimate the image height,
-with a DNL to give the true, smaller height at the end of the first scan.
-This would solve the space problems if the overestimate wasn't too great.
-However, it implies that you don't even know whether DNL will be used.
-
-This leads to a couple of very serious objections:
-1. Testing for a DNL marker must occur in the inner loop of the decompressor's
-   Huffman decoder; this implies a speed penalty whether the feature is used
-   or not.
-2. There is no way to hide the last-minute change in image height from an
-   application using the decoder.  Thus *every* application using the IJG
-   library would suffer a complexity penalty whether it cared about DNL or
-   not.
-We currently do not support DNL because of these problems.
-
-A different approach is to insist that DNL-using files be preprocessed by a
-separate program that reads ahead to the DNL, then goes back and fixes the SOF
-marker.  This is a much simpler solution and is probably far more efficient.
-Even if one wants piped input, buffering the first scan of the JPEG file needs
-a lot smaller temp file than is implied by the maximum-height method.  For
-this approach we'd simply treat DNL as a no-op in the decompressor (at most,
-check that it matches the SOF image height).
-
-We will not worry about making the compressor capable of outputting DNL.
-Something similar to the first scheme above could be applied if anyone ever
-wants to make that work.
diff --git a/jpeg/structure.txt b/jpeg/structure.txt
new file mode 100644
index 000000000..fe88701e3
--- /dev/null
+++ b/jpeg/structure.txt
@@ -0,0 +1,945 @@
+IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
+
+Copyright (C) 1991-2009, Thomas G. Lane, Guido Vollbeding.
+This file is part of the Independent JPEG Group's software.
+For conditions of distribution and use, see the accompanying README file.
+
+
+This file provides an overview of the architecture of the IJG JPEG software;
+that is, the functions of the various modules in the system and the interfaces
+between modules.  For more precise details about any data structure or calling
+convention, see the include files and comments in the source code.
+
+We assume that the reader is already somewhat familiar with the JPEG standard.
+The README file includes references for learning about JPEG.  The file
+libjpeg.txt describes the library from the viewpoint of an application
+programmer using the library; it's best to read that file before this one.
+Also, the file coderules.txt describes the coding style conventions we use.
+
+In this document, JPEG-specific terminology follows the JPEG standard:
+  A "component" means a color channel, e.g., Red or Luminance.
+  A "sample" is a single component value (i.e., one number in the image data).
+  A "coefficient" is a frequency coefficient (a DCT transform output number).
+  A "block" is an 8x8 group of samples or coefficients.
+  An "MCU" (minimum coded unit) is an interleaved set of blocks of size
+	determined by the sampling factors, or a single block in a
+	noninterleaved scan.
+We do not use the terms "pixel" and "sample" interchangeably.  When we say
+pixel, we mean an element of the full-size image, while a sample is an element
+of the downsampled image.  Thus the number of samples may vary across
+components while the number of pixels does not.  (This terminology is not used
+rigorously throughout the code, but it is used in places where confusion would
+otherwise result.)
+
+
+*** System features ***
+
+The IJG distribution contains two parts:
+  * A subroutine library for JPEG compression and decompression.
+  * cjpeg/djpeg, two sample applications that use the library to transform
+    JFIF JPEG files to and from several other image formats.
+cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
+command-line user interface and I/O routines for several uncompressed image
+formats.  This document concentrates on the library itself.
+
+We desire the library to be capable of supporting all JPEG baseline, extended
+sequential, and progressive DCT processes.  Hierarchical processes are not
+supported.
+
+The library does not support the lossless (spatial) JPEG process.  Lossless
+JPEG shares little or no code with lossy JPEG, and would normally be used
+without the extensive pre- and post-processing provided by this library.
+We feel that lossless JPEG is better handled by a separate library.
+
+Within these limits, any set of compression parameters allowed by the JPEG
+spec should be readable for decompression.  (We can be more restrictive about
+what formats we can generate.)  Although the system design allows for all
+parameter values, some uncommon settings are not yet implemented and may
+never be; nonintegral sampling ratios are the prime example.  Furthermore,
+we treat 8-bit vs. 12-bit data precision as a compile-time switch, not a
+run-time option, because most machines can store 8-bit samples much more
+compactly than 12-bit samples.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, libtiff uses this
+library to implement JPEG compression within the TIFF file format.)
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  This code can be omitted if not needed.
+
+A wide range of quality vs. speed tradeoffs is possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+
+*** Portability issues ***
+
+Portability is an essential requirement for the library.  The key portability
+issues that show up at the level of system architecture are:
+
+1.  Memory usage.  We want the code to be able to run on PC-class machines
+with limited memory.  Images should therefore be processed sequentially (in
+strips), to avoid holding the whole image in memory at once.  Where a
+full-image buffer is necessary, we should be able to use either virtual memory
+or temporary files.
+
+2.  Near/far pointer distinction.  To run efficiently on 80x86 machines, the
+code should distinguish "small" objects (kept in near data space) from
+"large" ones (kept in far data space).  This is an annoying restriction, but
+fortunately it does not impact code quality for less brain-damaged machines,
+and the source code clutter turns out to be minimal with sufficient use of
+pointer typedefs.
+
+3.  Data precision.  We assume that "char" is at least 8 bits, "short" and
+"int" at least 16, "long" at least 32.  The code will work fine with larger
+data sizes, although memory may be used inefficiently in some cases.  However,
+the JPEG compressed datastream must ultimately appear on external storage as a
+sequence of 8-bit bytes if it is to conform to the standard.  This may pose a
+problem on machines where char is wider than 8 bits.  The library represents
+compressed data as an array of values of typedef JOCTET.  If no data type
+exactly 8 bits wide is available, custom data source and data destination
+modules must be written to unpack and pack the chosen JOCTET datatype into
+8-bit external representation.
+
+
+*** System overview ***
+
+The compressor and decompressor are each divided into two main sections:
+the JPEG compressor or decompressor proper, and the preprocessing or
+postprocessing functions.  The interface between these two sections is the
+image data that the official JPEG spec regards as its input or output: this
+data is in the colorspace to be used for compression, and it is downsampled
+to the sampling factors to be used.  The preprocessing and postprocessing
+steps are responsible for converting a normal image representation to or from
+this form.  (Those few applications that want to deal with YCbCr downsampled
+data can skip the preprocessing or postprocessing step.)
+
+Looking more closely, the compressor library contains the following main
+elements:
+
+  Preprocessing:
+    * Color space conversion (e.g., RGB to YCbCr).
+    * Edge expansion and downsampling.  Optionally, this step can do simple
+      smoothing --- this is often helpful for low-quality source data.
+  JPEG proper:
+    * MCU assembly, DCT, quantization.
+    * Entropy coding (sequential or progressive, Huffman or arithmetic).
+
+In addition to these modules we need overall control, marker generation,
+and support code (memory management & error handling).  There is also a
+module responsible for physically writing the output data --- typically
+this is just an interface to fwrite(), but some applications may need to
+do something else with the data.
+
+The decompressor library contains the following main elements:
+
+  JPEG proper:
+    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
+    * Dequantization, inverse DCT, MCU disassembly.
+  Postprocessing:
+    * Upsampling.  Optionally, this step may be able to do more general
+      rescaling of the image.
+    * Color space conversion (e.g., YCbCr to RGB).  This step may also
+      provide gamma adjustment [currently it does not].
+    * Optional color quantization (e.g., reduction to 256 colors).
+    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
+      [This feature is not currently implemented.]
+
+We also need overall control, marker parsing, and a data source module.
+The support code (memory management & error handling) can be shared with
+the compression half of the library.
+
+There may be several implementations of each of these elements, particularly
+in the decompressor, where a wide range of speed/quality tradeoffs is very
+useful.  It must be understood that some of the best speedups involve
+merging adjacent steps in the pipeline.  For example, upsampling, color space
+conversion, and color quantization might all be done at once when using a
+low-quality ordered-dither technique.  The system architecture is designed to
+allow such merging where appropriate.
+
+
+Note: it is convenient to regard edge expansion (padding to block boundaries)
+as a preprocessing/postprocessing function, even though the JPEG spec includes
+it in compression/decompression.  We do this because downsampling/upsampling
+can be simplified a little if they work on padded data: it's not necessary to
+have special cases at the right and bottom edges.  Therefore the interface
+buffer is always an integral number of blocks wide and high, and we expect
+compression preprocessing to pad the source data properly.  Padding will occur
+only to the next block (8-sample) boundary.  In an interleaved-scan situation,
+additional dummy blocks may be used to fill out MCUs, but the MCU assembly and
+disassembly logic will create or discard these blocks internally.  (This is
+advantageous for speed reasons, since we avoid DCTing the dummy blocks.
+It also permits a small reduction in file size, because the compressor can
+choose dummy block contents so as to minimize their size in compressed form.
+Finally, it makes the interface buffer specification independent of whether
+the file is actually interleaved or not.)  Applications that wish to deal
+directly with the downsampled data must provide similar buffering and padding
+for odd-sized images.
+
+
+*** Poor man's object-oriented programming ***
+
+It should be clear by now that we have a lot of quasi-independent processing
+steps, many of which have several possible behaviors.  To avoid cluttering the
+code with lots of switch statements, we use a simple form of object-style
+programming to separate out the different possibilities.
+
+For example, two different color quantization algorithms could be implemented
+as two separate modules that present the same external interface; at runtime,
+the calling code will access the proper module indirectly through an "object".
+
+We can get the limited features we need while staying within portable C.
+The basic tool is a function pointer.  An "object" is just a struct
+containing one or more function pointer fields, each of which corresponds to
+a method name in real object-oriented languages.  During initialization we
+fill in the function pointers with references to whichever module we have
+determined we need to use in this run.  Then invocation of the module is done
+by indirecting through a function pointer; on most machines this is no more
+expensive than a switch statement, which would be the only other way of
+making the required run-time choice.  The really significant benefit, of
+course, is keeping the source code clean and well structured.
+
+We can also arrange to have private storage that varies between different
+implementations of the same kind of object.  We do this by making all the
+module-specific object structs be separately allocated entities, which will
+be accessed via pointers in the master compression or decompression struct.
+The "public" fields or methods for a given kind of object are specified by
+a commonly known struct.  But a module's initialization code can allocate
+a larger struct that contains the common struct as its first member, plus
+additional private fields.  With appropriate pointer casting, the module's
+internal functions can access these private fields.  (For a simple example,
+see jdatadst.c, which implements the external interface specified by struct
+jpeg_destination_mgr, but adds extra fields.)
+
+(Of course this would all be a lot easier if we were using C++, but we are
+not yet prepared to assume that everyone has a C++ compiler.)
+
+An important benefit of this scheme is that it is easy to provide multiple
+versions of any method, each tuned to a particular case.  While a lot of
+precalculation might be done to select an optimal implementation of a method,
+the cost per invocation is constant.  For example, the upsampling step might
+have a "generic" method, plus one or more "hardwired" methods for the most
+popular sampling factors; the hardwired methods would be faster because they'd
+use straight-line code instead of for-loops.  The cost to determine which
+method to use is paid only once, at startup, and the selection criteria are
+hidden from the callers of the method.
+
+This plan differs a little bit from usual object-oriented structures, in that
+only one instance of each object class will exist during execution.  The
+reason for having the class structure is that on different runs we may create
+different instances (choose to execute different modules).  You can think of
+the term "method" as denoting the common interface presented by a particular
+set of interchangeable functions, and "object" as denoting a group of related
+methods, or the total shared interface behavior of a group of modules.
+
+
+*** Overall control structure ***
+
+We previously mentioned the need for overall control logic in the compression
+and decompression libraries.  In IJG implementations prior to v5, overall
+control was mostly provided by "pipeline control" modules, which proved to be
+large, unwieldy, and hard to understand.  To improve the situation, the
+control logic has been subdivided into multiple modules.  The control modules
+consist of:
+
+1. Master control for module selection and initialization.  This has two
+responsibilities:
+
+   1A.  Startup initialization at the beginning of image processing.
+        The individual processing modules to be used in this run are selected
+        and given initialization calls.
+
+   1B.  Per-pass control.  This determines how many passes will be performed
+        and calls each active processing module to configure itself
+        appropriately at the beginning of each pass.  End-of-pass processing,
+	where necessary, is also invoked from the master control module.
+
+   Method selection is partially distributed, in that a particular processing
+   module may contain several possible implementations of a particular method,
+   which it will select among when given its initialization call.  The master
+   control code need only be concerned with decisions that affect more than
+   one module.
+ 
+2. Data buffering control.  A separate control module exists for each
+   inter-processing-step data buffer.  This module is responsible for
+   invoking the processing steps that write or read that data buffer.
+
+Each buffer controller sees the world as follows:
+
+input data => processing step A => buffer => processing step B => output data
+                      |              |               |
+              ------------------ controller ------------------
+
+The controller knows the dataflow requirements of steps A and B: how much data
+they want to accept in one chunk and how much they output in one chunk.  Its
+function is to manage its buffer and call A and B at the proper times.
+
+A data buffer control module may itself be viewed as a processing step by a
+higher-level control module; thus the control modules form a binary tree with
+elementary processing steps at the leaves of the tree.
+
+The control modules are objects.  A considerable amount of flexibility can
+be had by replacing implementations of a control module.  For example:
+* Merging of adjacent steps in the pipeline is done by replacing a control
+  module and its pair of processing-step modules with a single processing-
+  step module.  (Hence the possible merges are determined by the tree of
+  control modules.)
+* In some processing modes, a given interstep buffer need only be a "strip"
+  buffer large enough to accommodate the desired data chunk sizes.  In other
+  modes, a full-image buffer is needed and several passes are required.
+  The control module determines which kind of buffer is used and manipulates
+  virtual array buffers as needed.  One or both processing steps may be
+  unaware of the multi-pass behavior.
+
+In theory, we might be able to make all of the data buffer controllers
+interchangeable and provide just one set of implementations for all.  In
+practice, each one contains considerable special-case processing for its
+particular job.  The buffer controller concept should be regarded as an
+overall system structuring principle, not as a complete description of the
+task performed by any one controller.
+
+
+*** Compression object structure ***
+
+Here is a sketch of the logical structure of the JPEG compression library:
+
+                                                 |-- Colorspace conversion
+                  |-- Preprocessing controller --|
+                  |                              |-- Downsampling
+Main controller --|
+                  |                            |-- Forward DCT, quantize
+                  |-- Coefficient controller --|
+                                               |-- Entropy encoding
+
+This sketch also describes the flow of control (subroutine calls) during
+typical image data processing.  Each of the components shown in the diagram is
+an "object" which may have several different implementations available.  One
+or more source code files contain the actual implementation(s) of each object.
+
+The objects shown above are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the preprocessed input data.  This controller invokes preprocessing to
+  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
+  usually no need for a full-image buffer here; a strip buffer is adequate.
+
+* Preprocessing controller: buffer controller for the downsampling input data
+  buffer, which lies between colorspace conversion and downsampling.  Note
+  that a unified conversion/downsampling module would probably replace this
+  controller entirely.
+
+* Colorspace conversion: converts application image data into the desired
+  JPEG color space; also changes the data from pixel-interleaved layout to
+  separate component planes.  Processes one pixel row at a time.
+
+* Downsampling: performs reduction of chroma components as required.
+  Optionally may perform pixel-level smoothing as well.  Processes a "row
+  group" at a time, where a row group is defined as Vmax pixel rows of each
+  component before downsampling, and Vk sample rows afterwards (remember Vk
+  differs across components).  Some downsampling or smoothing algorithms may
+  require context rows above and below the current row group; the
+  preprocessing controller is responsible for supplying these rows via proper
+  buffering.  The downsampler is responsible for edge expansion at the right
+  edge (i.e., extending each sample row to a multiple of 8 samples); but the
+  preprocessing controller is responsible for vertical edge expansion (i.e.,
+  duplicating the bottom sample row as needed to make a multiple of 8 rows).
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU assembly, including insertion of dummy DCT
+  blocks when needed at the right or bottom edge.  When performing
+  Huffman-code optimization or emitting a multiscan JPEG file, this
+  controller is responsible for buffering the full image.  The equivalent of
+  one fully interleaved MCU row of subsampled data is processed per call,
+  even when the JPEG file is noninterleaved.
+
+* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
+  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
+  emitted in normal array order, which the entropy encoder is expected to
+  convert to zigzag order as necessary.  Prior versions of the IJG code did
+  the conversion to zigzag order within the quantization step.)
+
+* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
+  coded data to the data destination module.  Works on one MCU per call.
+  For progressive JPEG, the same DCT blocks are fed to the entropy coder
+  during each pass, and the coder must emit the appropriate subset of
+  coefficients.
+
+In addition to the above objects, the compression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.
+
+* Marker writing: generates JPEG markers (except for RSTn, which is emitted
+  by the entropy encoder when needed).
+
+* Data destination manager: writes the output JPEG datastream to its final
+  destination (e.g., a file).  The destination manager supplied with the
+  library knows how to write to a stdio stream; for other behaviors, the
+  surrounding application may provide its own destination manager.
+
+* Memory manager: allocates and releases memory, controls virtual arrays
+  (with backing store management, where required).
+
+* Error handler: performs formatting and output of error and trace messages;
+  determines handling of nonfatal errors.  The surrounding application may
+  override some or all of this object's methods to change error handling.
+
+* Progress monitor: supports output of "percent-done" progress reports.
+  This object represents an optional callback to the surrounding application:
+  if wanted, it must be supplied by the application.
+
+The error handler, destination manager, and progress monitor objects are
+defined as separate objects in order to simplify application-specific
+customization of the JPEG library.  A surrounding application may override
+individual methods or supply its own all-new implementation of one of these
+objects.  The object interfaces for these objects are therefore treated as
+part of the application interface of the library, whereas the other objects
+are internal to the library.
+
+The error handler and memory manager are shared by JPEG compression and
+decompression; the progress monitor, if used, may be shared as well.
+
+
+*** Decompression object structure ***
+
+Here is a sketch of the logical structure of the JPEG decompression library:
+
+                                               |-- Entropy decoding
+                  |-- Coefficient controller --|
+                  |                            |-- Dequantize, Inverse DCT
+Main controller --|
+                  |                               |-- Upsampling
+                  |-- Postprocessing controller --|   |-- Colorspace conversion
+                                                  |-- Color quantization
+                                                  |-- Color precision reduction
+
+As before, this diagram also represents typical control flow.  The objects
+shown are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the output of JPEG decompression proper.  This controller's primary
+  task is to feed the postprocessing procedure.  Some upsampling algorithms
+  may require context rows above and below the current row group; when this
+  is true, the main controller is responsible for managing its buffer so as
+  to make context rows available.  In the current design, the main buffer is
+  always a strip buffer; a full-image buffer is never required.
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU disassembly, including deletion of any dummy
+  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
+  file, this controller is responsible for buffering the full image.
+  (Buffering DCT coefficients, rather than samples, is necessary to support
+  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
+  subsampled data is processed per call, even when the source JPEG file is
+  noninterleaved.
+
+* Entropy decoding: Read coded data from the data source module and perform
+  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
+  For progressive JPEG decoding, the coefficient controller supplies the prior
+  coefficients of each MCU (initially all zeroes), which the entropy decoder
+  modifies in each scan.
+
+* Dequantization and inverse DCT: like it says.  Note that the coefficients
+  buffered by the coefficient controller have NOT been dequantized; we
+  merge dequantization and inverse DCT into a single step for speed reasons.
+  When scaled-down output is asked for, simplified DCT algorithms may be used
+  that need fewer coefficients and emit fewer samples per DCT block, not the
+  full 8x8.  Works on one DCT block at a time.
+
+* Postprocessing controller: buffer controller for the color quantization
+  input buffer, when quantization is in use.  (Without quantization, this
+  controller just calls the upsampler.)  For two-pass quantization, this
+  controller is responsible for buffering the full-image data.
+
+* Upsampling: restores chroma components to full size.  (May support more
+  general output rescaling, too.  Note that if undersized DCT outputs have
+  been emitted by the DCT module, this module must adjust so that properly
+  sized outputs are created.)  Works on one row group at a time.  This module
+  also calls the color conversion module, so its top level is effectively a
+  buffer controller for the upsampling->color conversion buffer.  However, in
+  all but the highest-quality operating modes, upsampling and color
+  conversion are likely to be merged into a single step.
+
+* Colorspace conversion: convert from JPEG color space to output color space,
+  and change data layout from separate component planes to pixel-interleaved.
+  Works on one pixel row at a time.
+
+* Color quantization: reduce the data to colormapped form, using either an
+  externally specified colormap or an internally generated one.  This module
+  is not used for full-color output.  Works on one pixel row at a time; may
+  require two passes to generate a color map.  Note that the output will
+  always be a single component representing colormap indexes.  In the current
+  design, the output values are JSAMPLEs, so an 8-bit compilation cannot
+  quantize to more than 256 colors.  This is unlikely to be a problem in
+  practice.
+
+* Color reduction: this module handles color precision reduction, e.g.,
+  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
+  Not quite clear yet how this should be handled... should we merge it with
+  colorspace conversion???
+
+Note that some high-speed operating modes might condense the entire
+postprocessing sequence to a single module (upsample, color convert, and
+quantize in one step).
+
+In addition to the above objects, the decompression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.  This is subdivided into
+  input and output control: jdinput.c controls only input-side processing,
+  while jdmaster.c handles overall initialization and output-side control.
+
+* Marker reading: decodes JPEG markers (except for RSTn).
+
+* Data source manager: supplies the input JPEG datastream.  The source
+  manager supplied with the library knows how to read from a stdio stream;
+  for other behaviors, the surrounding application may provide its own source
+  manager.
+
+* Memory manager: same as for compression library.
+
+* Error handler: same as for compression library.
+
+* Progress monitor: same as for compression library.
+
+As with compression, the data source manager, error handler, and progress
+monitor are candidates for replacement by a surrounding application.
+
+
+*** Decompression input and output separation ***
+
+To support efficient incremental display of progressive JPEG files, the
+decompressor is divided into two sections that can run independently:
+
+1. Data input includes marker parsing, entropy decoding, and input into the
+   coefficient controller's DCT coefficient buffer.  Note that this
+   processing is relatively cheap and fast.
+
+2. Data output reads from the DCT coefficient buffer and performs the IDCT
+   and all postprocessing steps.
+
+For a progressive JPEG file, the data input processing is allowed to get
+arbitrarily far ahead of the data output processing.  (This occurs only
+if the application calls jpeg_consume_input(); otherwise input and output
+run in lockstep, since the input section is called only when the output
+section needs more data.)  In this way the application can avoid making
+extra display passes when data is arriving faster than the display pass
+can run.  Furthermore, it is possible to abort an output pass without
+losing anything, since the coefficient buffer is read-only as far as the
+output section is concerned.  See libjpeg.txt for more detail.
+
+A full-image coefficient array is only created if the JPEG file has multiple
+scans (or if the application specifies buffered-image mode anyway).  When
+reading a single-scan file, the coefficient controller normally creates only
+a one-MCU buffer, so input and output processing must run in lockstep in this
+case.  jpeg_consume_input() is effectively a no-op in this situation.
+
+The main impact of dividing the decompressor in this fashion is that we must
+be very careful with shared variables in the cinfo data structure.  Each
+variable that can change during the course of decompression must be
+classified as belonging to data input or data output, and each section must
+look only at its own variables.  For example, the data output section may not
+depend on any of the variables that describe the current scan in the JPEG
+file, because these may change as the data input section advances into a new
+scan.
+
+The progress monitor is (somewhat arbitrarily) defined to treat input of the
+file as one pass when buffered-image mode is not used, and to ignore data
+input work completely when buffered-image mode is used.  Note that the
+library has no reliable way to predict the number of passes when dealing
+with a progressive JPEG file, nor can it predict the number of output passes
+in buffered-image mode.  So the work estimate is inherently bogus anyway.
+
+No comparable division is currently made in the compression library, because
+there isn't any real need for it.
+
+
+*** Data formats ***
+
+Arrays of pixel sample values use the following data structure:
+
+    typedef something JSAMPLE;		a pixel component value, 0..MAXJSAMPLE
+    typedef JSAMPLE *JSAMPROW;		ptr to a row of samples
+    typedef JSAMPROW *JSAMPARRAY;	ptr to a list of rows
+    typedef JSAMPARRAY *JSAMPIMAGE;	ptr to a list of color-component arrays
+
+The basic element type JSAMPLE will typically be one of unsigned char,
+(signed) char, or short.  Short will be used if samples wider than 8 bits are
+to be supported (this is a compile-time option).  Otherwise, unsigned char is
+used if possible.  If the compiler only supports signed chars, then it is
+necessary to mask off the value when reading.  Thus, all reads of JSAMPLE
+values must be coded as "GETJSAMPLE(value)", where the macro will be defined
+as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
+
+With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
+simplify correct rounding during downsampling, etc.  The JPEG standard's
+specification that sample values run from -128..127 is accommodated by
+subtracting 128 from the sample value in the DCT step.  Similarly, during
+decompression the output of the IDCT step will be immediately shifted back to
+0..255.  (NB: different values are required when 12-bit samples are in use.
+The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
+defined as 255 and 128 respectively in an 8-bit implementation, and as 4095
+and 2048 in a 12-bit implementation.)
+
+We use a pointer per row, rather than a two-dimensional JSAMPLE array.  This
+choice costs only a small amount of memory and has several benefits:
+* Code using the data structure doesn't need to know the allocated width of
+  the rows.  This simplifies edge expansion/compression, since we can work
+  in an array that's wider than the logical picture width.
+* Indexing doesn't require multiplication; this is a performance win on many
+  machines.
+* Arrays with more than 64K total elements can be supported even on machines
+  where malloc() cannot allocate chunks larger than 64K.
+* The rows forming a component array may be allocated at different times
+  without extra copying.  This trick allows some speedups in smoothing steps
+  that need access to the previous and next rows.
+
+Note that each color component is stored in a separate array; we don't use the
+traditional layout in which the components of a pixel are stored together.
+This simplifies coding of modules that work on each component independently,
+because they don't need to know how many components there are.  Furthermore,
+we can read or write each component to a temporary file independently, which
+is helpful when dealing with noninterleaved JPEG files.
+
+In general, a specific sample value is accessed by code such as
+	GETJSAMPLE(image[colorcomponent][row][col])
+where col is measured from the image left edge, but row is measured from the
+first sample row currently in memory.  Either of the first two indexings can
+be precomputed by copying the relevant pointer.
+
+
+Since most image-processing applications prefer to work on images in which
+the components of a pixel are stored together, the data passed to or from the
+surrounding application uses the traditional convention: a single pixel is
+represented by N consecutive JSAMPLE values, and an image row is an array of
+(# of color components)*(image width) JSAMPLEs.  One or more rows of data can
+be represented by a pointer of type JSAMPARRAY in this scheme.  This scheme is
+converted to component-wise storage inside the JPEG library.  (Applications
+that want to skip JPEG preprocessing or postprocessing will have to contend
+with component-wise storage.)
+
+
+Arrays of DCT-coefficient values use the following data structure:
+
+    typedef short JCOEF;		a 16-bit signed integer
+    typedef JCOEF JBLOCK[DCTSIZE2];	an 8x8 block of coefficients
+    typedef JBLOCK *JBLOCKROW;		ptr to one horizontal row of 8x8 blocks
+    typedef JBLOCKROW *JBLOCKARRAY;	ptr to a list of such rows
+    typedef JBLOCKARRAY *JBLOCKIMAGE;	ptr to a list of color component arrays
+
+The underlying type is at least a 16-bit signed integer; while "short" is big
+enough on all machines of interest, on some machines it is preferable to use
+"int" for speed reasons, despite the storage cost.  Coefficients are grouped
+into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
+"8" and "64").
+
+The contents of a coefficient block may be in either "natural" or zigzagged
+order, and may be true values or divided by the quantization coefficients,
+depending on where the block is in the processing pipeline.  In the current
+library, coefficient blocks are kept in natural order everywhere; the entropy
+codecs zigzag or dezigzag the data as it is written or read.  The blocks
+contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
+(This latter decision may need to be revisited to support variable
+quantization a la JPEG Part 3.)
+
+Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
+eight rows of samples.  Otherwise the structure is much the same as for
+samples, and for the same reasons.
+
+On machines where malloc() can't handle a request bigger than 64KB, this data
+structure limits us to rows of fewer than 512 JBLOCKs, or a picture width of
+4000+ pixels.  This seems an acceptable restriction.
+
+
+On 80x86 machines, the bottom-level pointer types (JSAMPROW and JBLOCKROW)
+must be declared as "far" pointers, but the upper levels can be "near"
+(implying that the pointer lists are allocated in the DS segment).
+We use a #define symbol FAR, which expands to the "far" keyword when
+compiling on 80x86 machines and to nothing elsewhere.
+
+
+*** Suspendable processing ***
+
+In some applications it is desirable to use the JPEG library as an
+incremental, memory-to-memory filter.  In this situation the data source or
+destination may be a limited-size buffer, and we can't rely on being able to
+empty or refill the buffer at arbitrary times.  Instead the application would
+like to have control return from the library at buffer overflow/underrun, and
+then resume compression or decompression at a later time.
+
+This scenario is supported for simple cases.  (For anything more complex, we
+recommend that the application "bite the bullet" and develop real multitasking
+capability.)  The libjpeg.txt file goes into more detail about the usage and
+limitations of this capability; here we address the implications for library
+structure.
+
+The essence of the problem is that the entropy codec (coder or decoder) must
+be prepared to stop at arbitrary times.  In turn, the controllers that call
+the entropy codec must be able to stop before having produced or consumed all
+the data that they normally would handle in one call.  That part is reasonably
+straightforward: we make the controller call interfaces include "progress
+counters" which indicate the number of data chunks successfully processed, and
+we require callers to test the counter rather than just assume all of the data
+was processed.
+
+Rather than trying to restart at an arbitrary point, the current Huffman
+codecs are designed to restart at the beginning of the current MCU after a
+suspension due to buffer overflow/underrun.  At the start of each call, the
+codec's internal state is loaded from permanent storage (in the JPEG object
+structures) into local variables.  On successful completion of the MCU, the
+permanent state is updated.  (This copying is not very expensive, and may even
+lead to *improved* performance if the local variables can be registerized.)
+If a suspension occurs, the codec simply returns without updating the state,
+thus effectively reverting to the start of the MCU.  Note that this implies
+leaving some data unprocessed in the source/destination buffer (i.e., the
+compressed partial MCU).  The data source/destination module interfaces are
+specified so as to make this possible.  This also implies that the data buffer
+must be large enough to hold a worst-case compressed MCU; a couple thousand
+bytes should be enough.
+
+In a successive-approximation AC refinement scan, the progressive Huffman
+decoder has to be able to undo assignments of newly nonzero coefficients if it
+suspends before the MCU is complete, since decoding requires distinguishing
+previously-zero and previously-nonzero coefficients.  This is a bit tedious
+but probably won't have much effect on performance.  Other variants of Huffman
+decoding need not worry about this, since they will just store the same values
+again if forced to repeat the MCU.
+
+This approach would probably not work for an arithmetic codec, since its
+modifiable state is quite large and couldn't be copied cheaply.  Instead it
+would have to suspend and resume exactly at the point of the buffer end.
+
+The JPEG marker reader is designed to cope with suspension at an arbitrary
+point.  It does so by backing up to the start of the marker parameter segment,
+so the data buffer must be big enough to hold the largest marker of interest.
+Again, a couple KB should be adequate.  (A special "skip" convention is used
+to bypass COM and APPn markers, so these can be larger than the buffer size
+without causing problems; otherwise a 64K buffer would be needed in the worst
+case.)
+
+The JPEG marker writer currently does *not* cope with suspension.
+We feel that this is not necessary; it is much easier simply to require
+the application to ensure there is enough buffer space before starting.  (An
+empty 2K buffer is more than sufficient for the header markers; and ensuring
+there are a dozen or two bytes available before calling jpeg_finish_compress()
+will suffice for the trailer.)  This would not work for writing multi-scan
+JPEG files, but we simply do not intend to support that capability with
+suspension.
+
+
+*** Memory manager services ***
+
+The JPEG library's memory manager controls allocation and deallocation of
+memory, and it manages large "virtual" data arrays on machines where the
+operating system does not provide virtual memory.  Note that the same
+memory manager serves both compression and decompression operations.
+
+In all cases, allocated objects are tied to a particular compression or
+decompression master record, and they will be released when that master
+record is destroyed.
+
+The memory manager does not provide explicit deallocation of objects.
+Instead, objects are created in "pools" of free storage, and a whole pool
+can be freed at once.  This approach helps prevent storage-leak bugs, and
+it speeds up operations whenever malloc/free are slow (as they often are).
+The pools can be regarded as lifetime identifiers for objects.  Two
+pools/lifetimes are defined:
+  * JPOOL_PERMANENT	lasts until master record is destroyed
+  * JPOOL_IMAGE		lasts until done with image (JPEG datastream)
+Permanent lifetime is used for parameters and tables that should be carried
+across from one datastream to another; this includes all application-visible
+parameters.  Image lifetime is used for everything else.  (A third lifetime,
+JPOOL_PASS = one processing pass, was originally planned.  However it was
+dropped as not being worthwhile.  The actual usage patterns are such that the
+peak memory usage would be about the same anyway; and having per-pass storage
+substantially complicates the virtual memory allocation rules --- see below.)
+
+The memory manager deals with three kinds of object:
+1. "Small" objects.  Typically these require no more than 10K-20K total.
+2. "Large" objects.  These may require tens to hundreds of K depending on
+   image size.  Semantically they behave the same as small objects, but we
+   distinguish them for two reasons:
+     * On MS-DOS machines, large objects are referenced by FAR pointers,
+       small objects by NEAR pointers.
+     * Pool allocation heuristics may differ for large and small objects.
+   Note that individual "large" objects cannot exceed the size allowed by
+   type size_t, which may be 64K or less on some machines.
+3. "Virtual" objects.  These are large 2-D arrays of JSAMPLEs or JBLOCKs
+   (typically large enough for the entire image being processed).  The
+   memory manager provides stripwise access to these arrays.  On machines
+   without virtual memory, the rest of the array may be swapped out to a
+   temporary file.
+
+(Note: JSAMPARRAY and JBLOCKARRAY data structures are a combination of large
+objects for the data proper and small objects for the row pointers.  For
+convenience and speed, the memory manager provides single routines to create
+these structures.  Similarly, virtual arrays include a small control block
+and a JSAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
+
+In the present implementation, virtual arrays are only permitted to have image
+lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
+not very useful since a virtual array's raison d'etre is to store data for
+multiple passes through the image.)  We also expect that only "small" objects
+will be given permanent lifespan, though this restriction is not required by
+the memory manager.
+
+In a non-virtual-memory machine, some performance benefit can be gained by
+making the in-memory buffers for virtual arrays be as large as possible.
+(For small images, the buffers might fit entirely in memory, so blind
+swapping would be very wasteful.)  The memory manager will adjust the height
+of the buffers to fit within a prespecified maximum memory usage.  In order
+to do this in a reasonably optimal fashion, the manager needs to allocate all
+of the virtual arrays at once.  Therefore, there isn't a one-step allocation
+routine for virtual arrays; instead, there is a "request" routine that simply
+allocates the control block, and a "realize" routine (called just once) that
+determines space allocation and creates all of the actual buffers.  The
+realize routine must allow for space occupied by non-virtual large objects.
+(We don't bother to factor in the space needed for small objects, on the
+grounds that it isn't worth the trouble.)
+
+To support all this, we establish the following protocol for doing business
+with the memory manager:
+  1. Modules must request virtual arrays (which may have only image lifespan)
+     during the initial setup phase, i.e., in their jinit_xxx routines.
+  2. All "large" objects (including JSAMPARRAYs and JBLOCKARRAYs) must also be
+     allocated during initial setup.
+  3. realize_virt_arrays will be called at the completion of initial setup.
+     The above conventions ensure that sufficient information is available
+     for it to choose a good size for virtual array buffers.
+Small objects of any lifespan may be allocated at any time.  We expect that
+the total space used for small objects will be small enough to be negligible
+in the realize_virt_arrays computation.
+
+In a virtual-memory machine, we simply pretend that the available space is
+infinite, thus causing realize_virt_arrays to decide that it can allocate all
+the virtual arrays as full-size in-memory buffers.  The overhead of the
+virtual-array access protocol is very small when no swapping occurs.
+
+A virtual array can be specified to be "pre-zeroed"; when this flag is set,
+never-yet-written sections of the array are set to zero before being made
+available to the caller.  If this flag is not set, never-written sections
+of the array contain garbage.  (This feature exists primarily because the
+equivalent logic would otherwise be needed in jdcoefct.c for progressive
+JPEG mode; we may as well make it available for possible other uses.)
+
+The first write pass on a virtual array is required to occur in top-to-bottom
+order; read passes, as well as any write passes after the first one, may
+access the array in any order.  This restriction exists partly to simplify
+the virtual array control logic, and partly because some file systems may not
+support seeking beyond the current end-of-file in a temporary file.  The main
+implication of this restriction is that rearrangement of rows (such as
+converting top-to-bottom data order to bottom-to-top) must be handled while
+reading data out of the virtual array, not while putting it in.
+
+
+*** Memory manager internal structure ***
+
+To isolate system dependencies as much as possible, we have broken the
+memory manager into two parts.  There is a reasonably system-independent
+"front end" (jmemmgr.c) and a "back end" that contains only the code
+likely to change across systems.  All of the memory management methods
+outlined above are implemented by the front end.  The back end provides
+the following routines for use by the front end (none of these routines
+are known to the rest of the JPEG code):
+
+jpeg_mem_init, jpeg_mem_term	system-dependent initialization/shutdown
+
+jpeg_get_small, jpeg_free_small	interface to malloc and free library routines
+				(or their equivalents)
+
+jpeg_get_large, jpeg_free_large	interface to FAR malloc/free on MS-DOS machines;
+				else usually the same as
+				jpeg_get_small/jpeg_free_small
+
+jpeg_mem_available		estimate available memory
+
+jpeg_open_backing_store		create a backing-store object
+
+read_backing_store,		manipulate a backing-store object
+write_backing_store,
+close_backing_store
+
+On some systems there will be more than one type of backing-store object
+(specifically, in MS-DOS a backing store file might be an area of extended
+memory as well as a disk file).  jpeg_open_backing_store is responsible for
+choosing how to implement a given object.  The read/write/close routines
+are method pointers in the structure that describes a given object; this
+lets them be different for different object types.
+
+It may be necessary to ensure that backing store objects are explicitly
+released upon abnormal program termination.  For example, MS-DOS won't free
+extended memory by itself.  To support this, we will expect the main program
+or surrounding application to arrange to call self_destruct (typically via
+jpeg_destroy) upon abnormal termination.  This may require a SIGINT signal
+handler or equivalent.  We don't want to have the back end module install its
+own signal handler, because that would pre-empt the surrounding application's
+ability to control signal handling.
+
+The IJG distribution includes several memory manager back end implementations.
+Usually the same back end should be suitable for all applications on a given
+system, but it is possible for an application to supply its own back end at
+need.
+
+
+*** Implications of DNL marker ***
+
+Some JPEG files may use a DNL marker to postpone definition of the image
+height (this would be useful for a fax-like scanner's output, for instance).
+In these files the SOF marker claims the image height is 0, and you only
+find out the true image height at the end of the first scan.
+
+We could read these files as follows:
+1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
+2. When the DNL is found, update the image height in the global image
+   descriptor.
+This implies that control modules must avoid making copies of the image
+height, and must re-test for termination after each MCU row.  This would
+be easy enough to do.
+
+In cases where image-size data structures are allocated, this approach will
+result in very inefficient use of virtual memory or much-larger-than-necessary
+temporary files.  This seems acceptable for something that probably won't be a
+mainstream usage.  People might have to forgo use of memory-hogging options
+(such as two-pass color quantization or noninterleaved JPEG files) if they
+want efficient conversion of such files.  (One could improve efficiency by
+demanding a user-supplied upper bound for the height, less than 65536; in most
+cases it could be much less.)
+
+The standard also permits the SOF marker to overestimate the image height,
+with a DNL to give the true, smaller height at the end of the first scan.
+This would solve the space problems if the overestimate wasn't too great.
+However, it implies that you don't even know whether DNL will be used.
+
+This leads to a couple of very serious objections:
+1. Testing for a DNL marker must occur in the inner loop of the decompressor's
+   Huffman decoder; this implies a speed penalty whether the feature is used
+   or not.
+2. There is no way to hide the last-minute change in image height from an
+   application using the decoder.  Thus *every* application using the IJG
+   library would suffer a complexity penalty whether it cared about DNL or
+   not.
+We currently do not support DNL because of these problems.
+
+A different approach is to insist that DNL-using files be preprocessed by a
+separate program that reads ahead to the DNL, then goes back and fixes the SOF
+marker.  This is a much simpler solution and is probably far more efficient.
+Even if one wants piped input, buffering the first scan of the JPEG file needs
+a much smaller temp file than is implied by the maximum-height method.  For
+this approach we'd simply treat DNL as a no-op in the decompressor (at most,
+check that it matches the SOF image height).
+
+We will not worry about making the compressor capable of outputting DNL.
+Something similar to the first scheme above could be applied if anyone ever
+wants to make that work.
diff --git a/jpeg/usage.doc b/jpeg/usage.doc
deleted file mode 100644
index 8c4970af0..000000000
--- a/jpeg/usage.doc
+++ /dev/null
@@ -1,562 +0,0 @@
-USAGE instructions for the Independent JPEG Group's JPEG software
-=================================================================
-
-This file describes usage of the JPEG conversion programs cjpeg and djpeg,
-as well as the utility programs jpegtran, rdjpgcom and wrjpgcom.  (See
-the other documentation files if you wish to use the JPEG library within
-your own programs.)
-
-If you are on a Unix machine you may prefer to read the Unix-style manual
-pages in files cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1.
-
-
-INTRODUCTION
-
-These programs implement JPEG image compression and decompression.  JPEG
-(pronounced "jay-peg") is a standardized compression method for full-color
-and gray-scale images.  JPEG is designed to handle "real-world" scenes,
-for example scanned photographs.  Cartoons, line drawings, and other
-non-realistic images are not JPEG's strong suit; on that sort of material
-you may get poor image quality and/or little compression.
-
-JPEG is lossy, meaning that the output image is not necessarily identical to
-the input image.  Hence you should not use JPEG if you have to have identical
-output bits.  However, on typical real-world images, very good compression
-levels can be obtained with no visible change, and amazingly high compression
-is possible if you can tolerate a low-quality image.  You can trade off image
-quality against file size by adjusting the compressor's "quality" setting.
-
-
-GENERAL USAGE
-
-We provide two programs, cjpeg to compress an image file into JPEG format,
-and djpeg to decompress a JPEG file back into a conventional image format.
-
-On Unix-like systems, you say:
-	cjpeg [switches] [imagefile] >jpegfile
-or
-	djpeg [switches] [jpegfile]  >imagefile
-The programs read the specified input file, or standard input if none is
-named.  They always write to standard output (with trace/error messages to
-standard error).  These conventions are handy for piping images between
-programs.
-
-On most non-Unix systems, you say:
-	cjpeg [switches] imagefile jpegfile
-or
-	djpeg [switches] jpegfile  imagefile
-i.e., both the input and output files are named on the command line.  This
-style is a little more foolproof, and it loses no functionality if you don't
-have pipes.  (You can get this style on Unix too, if you prefer, by defining
-TWO_FILE_COMMANDLINE when you compile the programs; see install.doc.)
-
-You can also say:
-	cjpeg [switches] -outfile jpegfile  imagefile
-or
-	djpeg [switches] -outfile imagefile  jpegfile
-This syntax works on all systems, so it is useful for scripts.
-
-The currently supported image file formats are: PPM (PBMPLUS color format),
-PGM (PBMPLUS gray-scale format), BMP, Targa, and RLE (Utah Raster Toolkit
-format).  (RLE is supported only if the URT library is available.)
-cjpeg recognizes the input image format automatically, with the exception
-of some Targa-format files.  You have to tell djpeg which format to generate.
-
-JPEG files are in the defacto standard JFIF file format.  There are other,
-less widely used JPEG-based file formats, but we don't support them.
-
-All switch names may be abbreviated; for example, -grayscale may be written
--gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
-one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
-British spellings are also accepted (e.g., -greyscale), though for brevity
-these are not mentioned below.
-
-
-CJPEG DETAILS
-
-The basic command line switches for cjpeg are:
-
-	-quality N	Scale quantization tables to adjust image quality.
-			Quality is 0 (worst) to 100 (best); default is 75.
-			(See below for more info.)
-
-	-grayscale	Create monochrome JPEG file from color input.
-			Be sure to use this switch when compressing a grayscale
-			BMP file, because cjpeg isn't bright enough to notice
-			whether a BMP file uses only shades of gray.  By
-			saying -grayscale, you'll get a smaller JPEG file that
-			takes less time to process.
-
-	-optimize	Perform optimization of entropy encoding parameters.
-			Without this, default encoding parameters are used.
-			-optimize usually makes the JPEG file a little smaller,
-			but cjpeg runs somewhat slower and needs much more
-			memory.  Image quality and speed of decompression are
-			unaffected by -optimize.
-
-	-progressive	Create progressive JPEG file (see below).
-
-	-targa		Input file is Targa format.  Targa files that contain
-			an "identification" field will not be automatically
-			recognized by cjpeg; for such files you must specify
-			-targa to make cjpeg treat the input as Targa format.
-			For most Targa files, you won't need this switch.
-
-The -quality switch lets you trade off compressed file size against quality of
-the reconstructed image: the higher the quality setting, the larger the JPEG
-file, and the closer the output image will be to the original input.  Normally
-you want to use the lowest quality setting (smallest file) that decompresses
-into something visually indistinguishable from the original image.  For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right.  If you see defects at -quality 75, then go up 5 or 10
-counts at a time until you are happy with the output image.  (The optimal
-setting will vary from one image to another.)
-
--quality 100 will generate a quantization table of all 1's, minimizing loss
-in the quantization step (but there is still information loss in subsampling,
-as well as roundoff error).  This setting is mainly of interest for
-experimental purposes.  Quality values above about 95 are NOT recommended for
-normal use; the compressed file size goes up dramatically for hardly any gain
-in output image quality.
-
-In the other direction, quality values below 50 will produce very small files
-of low image quality.  Settings around 5 to 10 might be useful in preparing an
-index of a large image library, for example.  Try -quality 2 (or so) for some
-amusing Cubist effects.  (Note: quality values below about 25 generate 2-byte
-quantization tables, which are considered optional in the JPEG standard.
-cjpeg emits a warning message when you give such a quality value, because some
-other JPEG programs may be unable to decode the resulting file.  Use -baseline
-if you need to ensure compatibility at low quality values.)
-
-The -progressive switch creates a "progressive JPEG" file.  In this type of
-JPEG file, the data is stored in multiple scans of increasing quality.  If the
-file is being transmitted over a slow communications link, the decoder can use
-the first scan to display a low-quality image very quickly, and can then
-improve the display with each subsequent scan.  The final image is exactly
-equivalent to a standard JPEG file of the same quality setting, and the total
-file size is about the same --- often a little smaller.  CAUTION: progressive
-JPEG is not yet widely implemented, so many decoders will be unable to view a
-progressive JPEG file at all.
-
-Switches for advanced users:
-
-	-dct int	Use integer DCT method (default).
-	-dct fast	Use fast integer DCT (less accurate).
-	-dct float	Use floating-point DCT method.
-			The float method is very slightly more accurate than
-			the int method, but is much slower unless your machine
-			has very fast floating-point hardware.  Also note that
-			results of the floating-point method may vary slightly
-			across machines, while the integer methods should give
-			the same results everywhere.  The fast integer method
-			is much less accurate than the other two.
-
-	-restart N	Emit a JPEG restart marker every N MCU rows, or every
-			N MCU blocks if "B" is attached to the number.
-			-restart 0 (the default) means no restart markers.
-
-	-smooth N	Smooth the input image to eliminate dithering noise.
-			N, ranging from 1 to 100, indicates the strength of
-			smoothing.  0 (the default) means no smoothing.
-
-	-maxmemory N	Set limit for amount of memory to use in processing
-			large images.  Value is in thousands of bytes, or
-			millions of bytes if "M" is attached to the number.
-			For example, -max 4m selects 4000000 bytes.  If more
-			space is needed, temporary files will be used.
-
-	-verbose	Enable debug printout.  More -v's give more printout.
-	or  -debug	Also, version information is printed at startup.
-
-The -restart option inserts extra markers that allow a JPEG decoder to
-resynchronize after a transmission error.  Without restart markers, any damage
-to a compressed file will usually ruin the image from the point of the error
-to the end of the image; with restart markers, the damage is usually confined
-to the portion of the image up to the next restart marker.  Of course, the
-restart markers occupy extra space.  We recommend -restart 1 for images that
-will be transmitted across unreliable networks such as Usenet.
-
-The -smooth option filters the input to eliminate fine-scale noise.  This is
-often useful when converting dithered images to JPEG: a moderate smoothing
-factor of 10 to 50 gets rid of dithering patterns in the input file, resulting
-in a smaller JPEG file and a better-looking image.  Too large a smoothing
-factor will visibly blur the image, however.
-
-Switches for wizards:
-
-	-baseline	Force baseline-compatible quantization tables to be
-			generated.  This clamps quantization values to 8 bits
-			even at low quality settings.  (This switch is poorly
-			named, since it does not ensure that the output is
-			actually baseline JPEG.  For example, you can use
-			-baseline and -progressive together.)
-
-	-qtables file	Use the quantization tables given in the specified
-			text file.
-
-	-qslots N[,...] Select which quantization table to use for each color
-			component.
-
-	-sample HxV[,...]  Set JPEG sampling factors for each color component.
-
-	-scans file	Use the scan script given in the specified text file.
-
-The "wizard" switches are intended for experimentation with JPEG.  If you
-don't know what you are doing, DON'T USE THEM.  These switches are documented
-further in the file wizard.doc.
-
-
-DJPEG DETAILS
-
-The basic command line switches for djpeg are:
-
-	-colors N	Reduce image to at most N colors.  This reduces the
-	or -quantize N	number of colors used in the output image, so that it
-			can be displayed on a colormapped display or stored in
-			a colormapped file format.  For example, if you have
-			an 8-bit display, you'd need to reduce to 256 or fewer
-			colors.  (-colors is the recommended name, -quantize
-			is provided only for backwards compatibility.)
-
-	-fast		Select recommended processing options for fast, low
-			quality output.  (The default options are chosen for
-			highest quality output.)  Currently, this is equivalent
-			to "-dct fast -nosmooth -onepass -dither ordered".
-
-	-grayscale	Force gray-scale output even if JPEG file is color.
-			Useful for viewing on monochrome displays; also,
-			djpeg runs noticeably faster in this mode.
-
-	-scale M/N	Scale the output image by a factor M/N.  Currently
-			the scale factor must be 1/1, 1/2, 1/4, or 1/8.
-			Scaling is handy if the image is larger than your
-			screen; also, djpeg runs much faster when scaling
-			down the output.
-
-	-bmp		Select BMP output format (Windows flavor).  8-bit
-			colormapped format is emitted if -colors or -grayscale
-			is specified, or if the JPEG file is gray-scale;
-			otherwise, 24-bit full-color format is emitted.
-
-	-gif		Select GIF output format.  Since GIF does not support
-			more than 256 colors, -colors 256 is assumed (unless
-			you specify a smaller number of colors).  If you
-			specify -fast, the default number of colors is 216.
-
-	-os2		Select BMP output format (OS/2 1.x flavor).  8-bit
-			colormapped format is emitted if -colors or -grayscale
-			is specified, or if the JPEG file is gray-scale;
-			otherwise, 24-bit full-color format is emitted.
-
-	-pnm		Select PBMPLUS (PPM/PGM) output format (this is the
-			default format).  PGM is emitted if the JPEG file is
-			gray-scale or if -grayscale is specified; otherwise
-			PPM is emitted.
-
-	-rle		Select RLE output format.  (Requires URT library.)
-
-	-targa		Select Targa output format.  Gray-scale format is
-			emitted if the JPEG file is gray-scale or if
-			-grayscale is specified; otherwise, colormapped format
-			is emitted if -colors is specified; otherwise, 24-bit
-			full-color format is emitted.
-
-Switches for advanced users:
-
-	-dct int	Use integer DCT method (default).
-	-dct fast	Use fast integer DCT (less accurate).
-	-dct float	Use floating-point DCT method.
-			The float method is very slightly more accurate than
-			the int method, but is much slower unless your machine
-			has very fast floating-point hardware.  Also note that
-			results of the floating-point method may vary slightly
-			across machines, while the integer methods should give
-			the same results everywhere.  The fast integer method
-			is much less accurate than the other two.
-
-	-dither fs	Use Floyd-Steinberg dithering in color quantization.
-	-dither ordered	Use ordered dithering in color quantization.
-	-dither none	Do not use dithering in color quantization.
-			By default, Floyd-Steinberg dithering is applied when
-			quantizing colors; this is slow but usually produces
-			the best results.  Ordered dither is a compromise
-			between speed and quality; no dithering is fast but
-			usually looks awful.  Note that these switches have
-			no effect unless color quantization is being done.
-			Ordered dither is only available in -onepass mode.
-
-	-map FILE	Quantize to the colors used in the specified image
-			file.  This is useful for producing multiple files
-			with identical color maps, or for forcing a predefined
-			set of colors to be used.  The FILE must be a GIF
-			or PPM file.  This option overrides -colors and
-			-onepass.
-
-	-nosmooth	Use a faster, lower-quality upsampling routine.
-
-	-onepass	Use one-pass instead of two-pass color quantization.
-			The one-pass method is faster and needs less memory,
-			but it produces a lower-quality image.  -onepass is
-			ignored unless you also say -colors N.  Also,
-			the one-pass method is always used for gray-scale
-			output (the two-pass method is no improvement then).
-
-	-maxmemory N	Set limit for amount of memory to use in processing
-			large images.  Value is in thousands of bytes, or
-			millions of bytes if "M" is attached to the number.
-			For example, -max 4m selects 4000000 bytes.  If more
-			space is needed, temporary files will be used.
-
-	-verbose	Enable debug printout.  More -v's give more printout.
-	or  -debug	Also, version information is printed at startup.
-
-
-HINTS FOR CJPEG
-
-Color GIF files are not the ideal input for JPEG; JPEG is really intended for
-compressing full-color (24-bit) images.  In particular, don't try to convert
-cartoons, line drawings, and other images that have only a few distinct
-colors.  GIF works great on these, JPEG does not.  If you want to convert a
-GIF to JPEG, you should experiment with cjpeg's -quality and -smooth options
-to get a satisfactory conversion.  -smooth 10 or so is often helpful.
-
-Avoid running an image through a series of JPEG compression/decompression
-cycles.  Image quality loss will accumulate; after ten or so cycles the image
-may be noticeably worse than it was after one cycle.  It's best to use a
-lossless format while manipulating an image, then convert to JPEG format when
-you are ready to file the image away.
-
-The -optimize option to cjpeg is worth using when you are making a "final"
-version for posting or archiving.  It's also a win when you are using low
-quality settings to make very small JPEG files; the percentage improvement
-is often a lot more than it is on larger files.  (At present, -optimize
-mode is always selected when generating progressive JPEG files.)
-
-GIF input files are no longer supported, to avoid the Unisys LZW patent.
-Use a Unisys-licensed program if you need to read a GIF file.  (Conversion
-of GIF files to JPEG is usually a bad idea anyway.)
-
-
-HINTS FOR DJPEG
-
-To get a quick preview of an image, use the -grayscale and/or -scale switches.
-"-grayscale -scale 1/8" is the fastest case.
-
-Several options are available that trade off image quality to gain speed.
-"-fast" turns on the recommended settings.
-
-"-dct fast" and/or "-nosmooth" gain speed at a small sacrifice in quality.
-When producing a color-quantized image, "-onepass -dither ordered" is fast but
-much lower quality than the default behavior.  "-dither none" may give
-acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
-
-If you are fortunate enough to have very fast floating point hardware,
-"-dct float" may be even faster than "-dct fast".  But on most machines
-"-dct float" is slower than "-dct int"; in this case it is not worth using,
-because its theoretical accuracy advantage is too small to be significant
-in practice.
-
-Two-pass color quantization requires a good deal of memory; on MS-DOS machines
-it may run out of memory even with -maxmemory 0.  In that case you can still
-decompress, with some loss of image quality, by specifying -onepass for
-one-pass quantization.
-
-To avoid the Unisys LZW patent, djpeg produces uncompressed GIF files.  These
-are larger than they should be, but are readable by standard GIF decoders.
-
-
-HINTS FOR BOTH PROGRAMS
-
-If more space is needed than will fit in the available main memory (as
-determined by -maxmemory), temporary files will be used.  (MS-DOS versions
-will try to get extended or expanded memory first.)  The temporary files are
-often rather large: in typical cases they occupy three bytes per pixel, for
-example 3*800*600 = 1.44Mb for an 800x600 image.  If you don't have enough
-free disk space, leave out -progressive and -optimize (for cjpeg) or specify
--onepass (for djpeg).
-
-On MS-DOS, the temporary files are created in the directory named by the TMP
-or TEMP environment variable, or in the current directory if neither of those
-exist.  Amiga implementations put the temp files in the directory named by
-JPEGTMP:, so be sure to assign JPEGTMP: to a disk partition with adequate free
-space.
-
-The default memory usage limit (-maxmemory) is set when the software is
-compiled.  If you get an "insufficient memory" error, try specifying a smaller
--maxmemory value, even -maxmemory 0 to use the absolute minimum space.  You
-may want to recompile with a smaller default value if this happens often.
-
-On machines that have "environment" variables, you can define the environment
-variable JPEGMEM to set the default memory limit.  The value is specified as
-described for the -maxmemory switch.  JPEGMEM overrides the default value
-specified when the program was compiled, and itself is overridden by an
-explicit -maxmemory switch.
-
-On MS-DOS machines, -maxmemory is the amount of main (conventional) memory to
-use.  (Extended or expanded memory is also used if available.)  Most
-DOS-specific versions of this software do their own memory space estimation
-and do not need you to specify -maxmemory.
-
-
-JPEGTRAN
-
-jpegtran performs various useful transformations of JPEG files.
-It can translate the coded representation from one variant of JPEG to another,
-for example from baseline JPEG to progressive JPEG or vice versa.  It can also
-perform some rearrangements of the image data, for example turning an image
-from landscape to portrait format by rotation.
-
-jpegtran works by rearranging the compressed data (DCT coefficients), without
-ever fully decoding the image.  Therefore, its transformations are lossless:
-there is no image degradation at all, which would not be true if you used
-djpeg followed by cjpeg to accomplish the same conversion.  But by the same
-token, jpegtran cannot perform lossy operations such as changing the image
-quality.
-
-jpegtran uses a command line syntax similar to cjpeg or djpeg.
-On Unix-like systems, you say:
-	jpegtran [switches] [inputfile] >outputfile
-On most non-Unix systems, you say:
-	jpegtran [switches] inputfile outputfile
-where both the input and output files are JPEG files.
-
-To specify the coded JPEG representation used in the output file,
-jpegtran accepts a subset of the switches recognized by cjpeg:
-	-optimize	Perform optimization of entropy encoding parameters.
-	-progressive	Create progressive JPEG file.
-	-restart N	Emit a JPEG restart marker every N MCU rows, or every
-			N MCU blocks if "B" is attached to the number.
-	-scans file	Use the scan script given in the specified text file.
-See the previous discussion of cjpeg for more details about these switches.
-If you specify none of these switches, you get a plain baseline-JPEG output
-file.  The quality setting and so forth are determined by the input file.
-
-The image can be losslessly transformed by giving one of these switches:
-	-flip horizontal	Mirror image horizontally (left-right).
-	-flip vertical		Mirror image vertically (top-bottom).
-	-rotate 90		Rotate image 90 degrees clockwise.
-	-rotate 180		Rotate image 180 degrees.
-	-rotate 270		Rotate image 270 degrees clockwise (or 90 ccw).
-	-transpose		Transpose image (across UL-to-LR axis).
-	-transverse		Transverse transpose (across UR-to-LL axis).
-
-The transpose transformation has no restrictions regarding image dimensions.
-The other transformations operate rather oddly if the image dimensions are not
-a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
-transform complete blocks of DCT coefficient data in the desired way.
-
-jpegtran's default behavior when transforming an odd-size image is designed
-to preserve exact reversibility and mathematical consistency of the
-transformation set.  As stated, transpose is able to flip the entire image
-area.  Horizontal mirroring leaves any partial iMCU column at the right edge
-untouched, but is able to flip all rows of the image.  Similarly, vertical
-mirroring leaves any partial iMCU row at the bottom edge untouched, but is
-able to flip all columns.  The other transforms can be built up as sequences
-of transpose and flip operations; for consistency, their actions on edge
-pixels are defined to be the same as the end result of the corresponding
-transpose-and-flip sequence.
-
-For practical use, you may prefer to discard any untransformable edge pixels
-rather than having a strange-looking strip along the right and/or bottom edges
-of a transformed image.  To do this, add the -trim switch:
-	-trim		Drop non-transformable edge blocks.
-Obviously, a transformation with -trim is not reversible, so strictly speaking
-jpegtran with this switch is not lossless.  Also, the expected mathematical
-equivalences between the transformations no longer hold.  For example,
-"-rot 270 -trim" trims only the bottom edge, but "-rot 90 -trim" followed by
-"-rot 180 -trim" trims both edges.
-
-Another not-strictly-lossless transformation switch is:
-	-grayscale	Force grayscale output.
-This option discards the chrominance channels if the input image is YCbCr
-(ie, a standard color JPEG), resulting in a grayscale JPEG file.  The
-luminance channel is preserved exactly, so this is a better method of reducing
-to grayscale than decompression, conversion, and recompression.  This switch
-is particularly handy for fixing a monochrome picture that was mistakenly
-encoded as a color JPEG.  (In such a case, the space savings from getting rid
-of the near-empty chroma channels won't be large; but the decoding time for
-a grayscale JPEG is substantially less than that for a color JPEG.)
-
-jpegtran also recognizes these switches that control what to do with "extra"
-markers, such as comment blocks:
-	-copy none	Copy no extra markers from source file.  This setting
-			suppresses all comments and other excess baggage
-			present in the source file.
-	-copy comments	Copy only comment markers.  This setting copies
-			comments from the source file, but discards
-			any other inessential data. 
-	-copy all	Copy all extra markers.  This setting preserves
-			miscellaneous markers found in the source file, such
-			as JFIF thumbnails and Photoshop settings.  In some
-			files these extra markers can be sizable.
-The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
-jpegtran always did the equivalent of -copy none.)
-
-Additional switches recognized by jpegtran are:
-	-outfile filename
-	-maxmemory N
-	-verbose
-	-debug
-These work the same as in cjpeg or djpeg.
-
-
-THE COMMENT UTILITIES
-
-The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
-Although the standard doesn't actually define what COM blocks are for, they
-are widely used to hold user-supplied text strings.  This lets you add
-annotations, titles, index terms, etc to your JPEG files, and later retrieve
-them as text.  COM blocks do not interfere with the image stored in the JPEG
-file.  The maximum size of a COM block is 64K, but you can have as many of
-them as you like in one JPEG file.
-
-We provide two utility programs to display COM block contents and add COM
-blocks to a JPEG file.
-
-rdjpgcom searches a JPEG file and prints the contents of any COM blocks on
-standard output.  The command line syntax is
-	rdjpgcom [-verbose] [inputfilename]
-The switch "-verbose" (or just "-v") causes rdjpgcom to also display the JPEG
-image dimensions.  If you omit the input file name from the command line,
-the JPEG file is read from standard input.  (This may not work on some
-operating systems, if binary data can't be read from stdin.)
-
-wrjpgcom adds a COM block, containing text you provide, to a JPEG file.
-Ordinarily, the COM block is added after any existing COM blocks, but you
-can delete the old COM blocks if you wish.  wrjpgcom produces a new JPEG
-file; it does not modify the input file.  DO NOT try to overwrite the input
-file by directing wrjpgcom's output back into it; on most systems this will
-just destroy your file.
-
-The command line syntax for wrjpgcom is similar to cjpeg's.  On Unix-like
-systems, it is
-	wrjpgcom [switches] [inputfilename]
-The output file is written to standard output.  The input file comes from
-the named file, or from standard input if no input file is named.
-
-On most non-Unix systems, the syntax is
-	wrjpgcom [switches] inputfilename outputfilename
-where both input and output file names must be given explicitly.
-
-wrjpgcom understands three switches:
-	-replace		 Delete any existing COM blocks from the file.
-	-comment "Comment text"	 Supply new COM text on command line.
-	-cfile name		 Read text for new COM block from named file.
-(Switch names can be abbreviated.)  If you have only one line of comment text
-to add, you can provide it on the command line with -comment.  The comment
-text must be surrounded with quotes so that it is treated as a single
-argument.  Longer comments can be read from a text file.
-
-If you give neither -comment nor -cfile, then wrjpgcom will read the comment
-text from standard input.  (In this case an input image file name MUST be
-supplied, so that the source JPEG file comes from somewhere else.)  You can
-enter multiple lines, up to 64KB worth.  Type an end-of-file indicator
-(usually control-D or control-Z) to terminate the comment text entry.
-
-wrjpgcom will not add a COM block if the provided comment string is empty.
-Therefore -replace -comment "" can be used to delete all COM blocks from a
-file.
-
-These utility programs do not depend on the IJG JPEG library.  In
-particular, the source code for rdjpgcom is intended as an illustration of
-the minimum amount of code required to parse a JPEG file header correctly.
diff --git a/jpeg/usage.txt b/jpeg/usage.txt
new file mode 100644
index 000000000..6e8546a6a
--- /dev/null
+++ b/jpeg/usage.txt
@@ -0,0 +1,617 @@
+USAGE instructions for the Independent JPEG Group's JPEG software
+=================================================================
+
+This file describes usage of the JPEG conversion programs cjpeg and djpeg,
+as well as the utility programs jpegtran, rdjpgcom and wrjpgcom.  (See
+the other documentation files if you wish to use the JPEG library within
+your own programs.)
+
+If you are on a Unix machine you may prefer to read the Unix-style manual
+pages in files cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1.
+
+
+INTRODUCTION
+
+These programs implement JPEG image encoding, decoding, and transcoding.
+JPEG (pronounced "jay-peg") is a standardized compression method for
+full-color and gray-scale images.
+
+
+GENERAL USAGE
+
+We provide two programs, cjpeg to compress an image file into JPEG format,
+and djpeg to decompress a JPEG file back into a conventional image format.
+
+On Unix-like systems, you say:
+	cjpeg [switches] [imagefile] >jpegfile
+or
+	djpeg [switches] [jpegfile]  >imagefile
+The programs read the specified input file, or standard input if none is
+named.  They always write to standard output (with trace/error messages to
+standard error).  These conventions are handy for piping images between
+programs.
+
+On most non-Unix systems, you say:
+	cjpeg [switches] imagefile jpegfile
+or
+	djpeg [switches] jpegfile  imagefile
+i.e., both the input and output files are named on the command line.  This
+style is a little more foolproof, and it loses no functionality if you don't
+have pipes.  (You can get this style on Unix too, if you prefer, by defining
+TWO_FILE_COMMANDLINE when you compile the programs; see install.txt.)
+
+You can also say:
+	cjpeg [switches] -outfile jpegfile  imagefile
+or
+	djpeg [switches] -outfile imagefile  jpegfile
+This syntax works on all systems, so it is useful for scripts.
+
+The currently supported image file formats are: PPM (PBMPLUS color format),
+PGM (PBMPLUS gray-scale format), BMP, Targa, and RLE (Utah Raster Toolkit
+format).  (RLE is supported only if the URT library is available.)
+cjpeg recognizes the input image format automatically, with the exception
+of some Targa-format files.  You have to tell djpeg which format to generate.
+
+JPEG files are in the de facto standard JFIF file format.  There are other,
+less widely used JPEG-based file formats, but we don't support them.
+
+All switch names may be abbreviated; for example, -grayscale may be written
+-gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
+one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
+British spellings are also accepted (e.g., -greyscale), though for brevity
+these are not mentioned below.
+
+
+CJPEG DETAILS
+
+The basic command line switches for cjpeg are:
+
+	-quality N[,...]  Scale quantization tables to adjust image quality.
+			Quality is 0 (worst) to 100 (best); default is 75.
+			(See below for more info.)
+
+	-grayscale	Create monochrome JPEG file from color input.
+			Be sure to use this switch when compressing a grayscale
+			BMP file, because cjpeg isn't bright enough to notice
+			whether a BMP file uses only shades of gray.  By
+			saying -grayscale, you'll get a smaller JPEG file that
+			takes less time to process.
+
+	-optimize	Perform optimization of entropy encoding parameters.
+			Without this, default encoding parameters are used.
+			-optimize usually makes the JPEG file a little smaller,
+			but cjpeg runs somewhat slower and needs much more
+			memory.  Image quality and speed of decompression are
+			unaffected by -optimize.
+
+	-progressive	Create progressive JPEG file (see below).
+
+	-scale M/N	Scale the output image by a factor M/N.  Currently
+			supported scale factors are 8/N with all N from 1 to
+			16.
+
+	-targa		Input file is Targa format.  Targa files that contain
+			an "identification" field will not be automatically
+			recognized by cjpeg; for such files you must specify
+			-targa to make cjpeg treat the input as Targa format.
+			For most Targa files, you won't need this switch.
+
+The -quality switch lets you trade off compressed file size against quality of
+the reconstructed image: the higher the quality setting, the larger the JPEG
+file, and the closer the output image will be to the original input.  Normally
+you want to use the lowest quality setting (smallest file) that decompresses
+into something visually indistinguishable from the original image.  For this
+purpose the quality setting should be between 50 and 95; the default of 75 is
+often about right.  If you see defects at -quality 75, then go up 5 or 10
+counts at a time until you are happy with the output image.  (The optimal
+setting will vary from one image to another.)
+
+-quality 100 will generate a quantization table of all 1's, minimizing loss
+in the quantization step (but there is still information loss in subsampling,
+as well as roundoff error).  This setting is mainly of interest for
+experimental purposes.  Quality values above about 95 are NOT recommended for
+normal use; the compressed file size goes up dramatically for hardly any gain
+in output image quality.
+
+In the other direction, quality values below 50 will produce very small files
+of low image quality.  Settings around 5 to 10 might be useful in preparing an
+index of a large image library, for example.  Try -quality 2 (or so) for some
+amusing Cubist effects.  (Note: quality values below about 25 generate 2-byte
+quantization tables, which are considered optional in the JPEG standard.
+cjpeg emits a warning message when you give such a quality value, because some
+other JPEG programs may be unable to decode the resulting file.  Use -baseline
+if you need to ensure compatibility at low quality values.)
+
+The -quality option has been extended in IJG version 7 for support of separate
+quality settings for luminance and chrominance (or in general, for every
+provided quantization table slot).  This feature is useful for high-quality
+applications which cannot accept the damage of color data by coarse
+subsampling settings.  You can now easily reduce the color data amount more
+smoothly with finer control without separate subsampling.  The resulting file
+is fully compliant with standard JPEG decoders.
+Note that the -quality ratings refer to the quantization table slots, and that
+the last value is replicated if there are more q-table slots than parameters.
+The default q-table slots are 0 for luminance and 1 for chrominance with
+default tables as given in the JPEG standard.  This is compatible with the old
+behaviour when only one parameter is given, which is then used for both
+luminance and chrominance (slots 0 and 1).  Additional or custom quantization
+tables can be set with -qtables and assigned to components with the -qslots
+parameter (see the "wizard" switches below).
+CAUTION: You must explicitly add -sample 1x1 for efficient separate color
+quality selection, since the default value used by the library is 2x2!
+
+The -progressive switch creates a "progressive JPEG" file.  In this type of
+JPEG file, the data is stored in multiple scans of increasing quality.  If the
+file is being transmitted over a slow communications link, the decoder can use
+the first scan to display a low-quality image very quickly, and can then
+improve the display with each subsequent scan.  The final image is exactly
+equivalent to a standard JPEG file of the same quality setting, and the total
+file size is about the same --- often a little smaller.
+
+Switches for advanced users:
+
+	-dct int	Use integer DCT method (default).
+	-dct fast	Use fast integer DCT (less accurate).
+	-dct float	Use floating-point DCT method.
+			The float method is very slightly more accurate than
+			the int method, but is much slower unless your machine
+			has very fast floating-point hardware.  Also note that
+			results of the floating-point method may vary slightly
+			across machines, while the integer methods should give
+			the same results everywhere.  The fast integer method
+			is much less accurate than the other two.
+
+	-nosmooth	Don't use high-quality downsampling.
+
+	-restart N	Emit a JPEG restart marker every N MCU rows, or every
+			N MCU blocks if "B" is attached to the number.
+			-restart 0 (the default) means no restart markers.
+
+	-smooth N	Smooth the input image to eliminate dithering noise.
+			N, ranging from 1 to 100, indicates the strength of
+			smoothing.  0 (the default) means no smoothing.
+
+	-maxmemory N	Set limit for amount of memory to use in processing
+			large images.  Value is in thousands of bytes, or
+			millions of bytes if "M" is attached to the number.
+			For example, -max 4m selects 4000000 bytes.  If more
+			space is needed, temporary files will be used.
+
+	-verbose	Enable debug printout.  More -v's give more printout.
+	or  -debug	Also, version information is printed at startup.
+
+The -restart option inserts extra markers that allow a JPEG decoder to
+resynchronize after a transmission error.  Without restart markers, any damage
+to a compressed file will usually ruin the image from the point of the error
+to the end of the image; with restart markers, the damage is usually confined
+to the portion of the image up to the next restart marker.  Of course, the
+restart markers occupy extra space.  We recommend -restart 1 for images that
+will be transmitted across unreliable networks such as Usenet.
+
+The -smooth option filters the input to eliminate fine-scale noise.  This is
+often useful when converting dithered images to JPEG: a moderate smoothing
+factor of 10 to 50 gets rid of dithering patterns in the input file, resulting
+in a smaller JPEG file and a better-looking image.  Too large a smoothing
+factor will visibly blur the image, however.
+
+Switches for wizards:
+
+	-arithmetic	Use arithmetic coding.  CAUTION: arithmetic coded JPEG
+			is not yet widely implemented, so many decoders will
+			be unable to view an arithmetic coded JPEG file at
+			all.
+
+	-baseline	Force baseline-compatible quantization tables to be
+			generated.  This clamps quantization values to 8 bits
+			even at low quality settings.  (This switch is poorly
+			named, since it does not ensure that the output is
+			actually baseline JPEG.  For example, you can use
+			-baseline and -progressive together.)
+
+	-qtables file	Use the quantization tables given in the specified
+			text file.
+
+	-qslots N[,...] Select which quantization table to use for each color
+			component.
+
+	-sample HxV[,...]  Set JPEG sampling factors for each color component.
+
+	-scans file	Use the scan script given in the specified text file.
+
+The "wizard" switches are intended for experimentation with JPEG.  If you
+don't know what you are doing, DON'T USE THEM.  These switches are documented
+further in the file wizard.txt.
+
+
+DJPEG DETAILS
+
+The basic command line switches for djpeg are:
+
+	-colors N	Reduce image to at most N colors.  This reduces the
+	or -quantize N	number of colors used in the output image, so that it
+			can be displayed on a colormapped display or stored in
+			a colormapped file format.  For example, if you have
+			an 8-bit display, you'd need to reduce to 256 or fewer
+			colors.  (-colors is the recommended name, -quantize
+			is provided only for backwards compatibility.)
+
+	-fast		Select recommended processing options for fast, low
+			quality output.  (The default options are chosen for
+			highest quality output.)  Currently, this is equivalent
+			to "-dct fast -nosmooth -onepass -dither ordered".
+
+	-grayscale	Force gray-scale output even if JPEG file is color.
+			Useful for viewing on monochrome displays; also,
+			djpeg runs noticeably faster in this mode.
+
+	-scale M/N	Scale the output image by a factor M/N.  Currently
+			supported scale factors are M/N with all M from 1 to
+			16, where N is the source DCT size, which is 8 for
+			baseline JPEG.  If the /N part is omitted, then M
+			specifies the DCT scaled size to be applied on the
+			given input.  For baseline JPEG this is equivalent to
+			M/8 scaling, since the source DCT size for baseline
+			JPEG is 8.  Scaling is handy if the image is larger
+			than your screen; also, djpeg runs much faster when
+			scaling down the output.
+
+	-bmp		Select BMP output format (Windows flavor).  8-bit
+			colormapped format is emitted if -colors or -grayscale
+			is specified, or if the JPEG file is gray-scale;
+			otherwise, 24-bit full-color format is emitted.
+
+	-gif		Select GIF output format.  Since GIF does not support
+			more than 256 colors, -colors 256 is assumed (unless
+			you specify a smaller number of colors).  If you
+			specify -fast, the default number of colors is 216.
+
+	-os2		Select BMP output format (OS/2 1.x flavor).  8-bit
+			colormapped format is emitted if -colors or -grayscale
+			is specified, or if the JPEG file is gray-scale;
+			otherwise, 24-bit full-color format is emitted.
+
+	-pnm		Select PBMPLUS (PPM/PGM) output format (this is the
+			default format).  PGM is emitted if the JPEG file is
+			gray-scale or if -grayscale is specified; otherwise
+			PPM is emitted.
+
+	-rle		Select RLE output format.  (Requires URT library.)
+
+	-targa		Select Targa output format.  Gray-scale format is
+			emitted if the JPEG file is gray-scale or if
+			-grayscale is specified; otherwise, colormapped format
+			is emitted if -colors is specified; otherwise, 24-bit
+			full-color format is emitted.
+
+Switches for advanced users:
+
+	-dct int	Use integer DCT method (default).
+	-dct fast	Use fast integer DCT (less accurate).
+	-dct float	Use floating-point DCT method.
+			The float method is very slightly more accurate than
+			the int method, but is much slower unless your machine
+			has very fast floating-point hardware.  Also note that
+			results of the floating-point method may vary slightly
+			across machines, while the integer methods should give
+			the same results everywhere.  The fast integer method
+			is much less accurate than the other two.
+
+	-dither fs	Use Floyd-Steinberg dithering in color quantization.
+	-dither ordered	Use ordered dithering in color quantization.
+	-dither none	Do not use dithering in color quantization.
+			By default, Floyd-Steinberg dithering is applied when
+			quantizing colors; this is slow but usually produces
+			the best results.  Ordered dither is a compromise
+			between speed and quality; no dithering is fast but
+			usually looks awful.  Note that these switches have
+			no effect unless color quantization is being done.
+			Ordered dither is only available in -onepass mode.
+
+	-map FILE	Quantize to the colors used in the specified image
+			file.  This is useful for producing multiple files
+			with identical color maps, or for forcing a predefined
+			set of colors to be used.  The FILE must be a GIF
+			or PPM file.  This option overrides -colors and
+			-onepass.
+
+	-nosmooth	Don't use high-quality upsampling.
+
+	-onepass	Use one-pass instead of two-pass color quantization.
+			The one-pass method is faster and needs less memory,
+			but it produces a lower-quality image.  -onepass is
+			ignored unless you also say -colors N.  Also,
+			the one-pass method is always used for gray-scale
+			output (the two-pass method is no improvement then).
+
+	-maxmemory N	Set limit for amount of memory to use in processing
+			large images.  Value is in thousands of bytes, or
+			millions of bytes if "M" is attached to the number.
+			For example, -max 4m selects 4000000 bytes.  If more
+			space is needed, temporary files will be used.
+
+	-verbose	Enable debug printout.  More -v's give more printout.
+	or  -debug	Also, version information is printed at startup.
+
+
+HINTS FOR CJPEG
+
+Color GIF files are not the ideal input for JPEG; JPEG is really intended for
+compressing full-color (24-bit) images.  In particular, don't try to convert
+cartoons, line drawings, and other images that have only a few distinct
+colors.  GIF works great on these, JPEG does not.  If you want to convert a
+GIF to JPEG, you should experiment with cjpeg's -quality and -smooth options
+to get a satisfactory conversion.  -smooth 10 or so is often helpful.
+
+Avoid running an image through a series of JPEG compression/decompression
+cycles.  Image quality loss will accumulate; after ten or so cycles the image
+may be noticeably worse than it was after one cycle.  It's best to use a
+lossless format while manipulating an image, then convert to JPEG format when
+you are ready to file the image away.
+
+The -optimize option to cjpeg is worth using when you are making a "final"
+version for posting or archiving.  It's also a win when you are using low
+quality settings to make very small JPEG files; the percentage improvement
+is often a lot more than it is on larger files.  (At present, -optimize
+mode is always selected when generating progressive JPEG files.)
+
+GIF input files are no longer supported, to avoid the Unisys LZW patent.
+(Conversion of GIF files to JPEG is usually a bad idea anyway.)
+
+
+HINTS FOR DJPEG
+
+To get a quick preview of an image, use the -grayscale and/or -scale switches.
+"-grayscale -scale 1/8" is the fastest case.
+
+Several options are available that trade off image quality to gain speed.
+"-fast" turns on the recommended settings.
+
+"-dct fast" and/or "-nosmooth" gain speed at a small sacrifice in quality.
+When producing a color-quantized image, "-onepass -dither ordered" is fast but
+much lower quality than the default behavior.  "-dither none" may give
+acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
+
+If you are fortunate enough to have very fast floating point hardware,
+"-dct float" may be even faster than "-dct fast".  But on most machines
+"-dct float" is slower than "-dct int"; in this case it is not worth using,
+because its theoretical accuracy advantage is too small to be significant
+in practice.
+
+Two-pass color quantization requires a good deal of memory; on MS-DOS machines
+it may run out of memory even with -maxmemory 0.  In that case you can still
+decompress, with some loss of image quality, by specifying -onepass for
+one-pass quantization.
+
+To avoid the Unisys LZW patent, djpeg produces uncompressed GIF files.  These
+are larger than they should be, but are readable by standard GIF decoders.
+
+
+HINTS FOR BOTH PROGRAMS
+
+If more space is needed than will fit in the available main memory (as
+determined by -maxmemory), temporary files will be used.  (MS-DOS versions
+will try to get extended or expanded memory first.)  The temporary files are
+often rather large: in typical cases they occupy three bytes per pixel, for
+example 3*800*600 = 1.44MB for an 800x600 image.  If you don't have enough
+free disk space, leave out -progressive and -optimize (for cjpeg) or specify
+-onepass (for djpeg).
+
+On MS-DOS, the temporary files are created in the directory named by the TMP
+or TEMP environment variable, or in the current directory if neither of those
+exists.  Amiga implementations put the temp files in the directory named by
+JPEGTMP:, so be sure to assign JPEGTMP: to a disk partition with adequate free
+space.
+
+The default memory usage limit (-maxmemory) is set when the software is
+compiled.  If you get an "insufficient memory" error, try specifying a smaller
+-maxmemory value, even -maxmemory 0 to use the absolute minimum space.  You
+may want to recompile with a smaller default value if this happens often.
+
+On machines that have "environment" variables, you can define the environment
+variable JPEGMEM to set the default memory limit.  The value is specified as
+described for the -maxmemory switch.  JPEGMEM overrides the default value
+specified when the program was compiled, and itself is overridden by an
+explicit -maxmemory switch.
+
+On MS-DOS machines, -maxmemory is the amount of main (conventional) memory to
+use.  (Extended or expanded memory is also used if available.)  Most
+DOS-specific versions of this software do their own memory space estimation
+and do not need you to specify -maxmemory.
+
+
+JPEGTRAN
+
+jpegtran performs various useful transformations of JPEG files.
+It can translate the coded representation from one variant of JPEG to another,
+for example from baseline JPEG to progressive JPEG or vice versa.  It can also
+perform some rearrangements of the image data, for example turning an image
+from landscape to portrait format by rotation.
+
+jpegtran works by rearranging the compressed data (DCT coefficients), without
+ever fully decoding the image.  Therefore, its transformations are lossless:
+there is no image degradation at all, which would not be true if you used
+djpeg followed by cjpeg to accomplish the same conversion.  But by the same
+token, jpegtran cannot perform lossy operations such as changing the image
+quality.
+
+jpegtran uses a command line syntax similar to cjpeg or djpeg.
+On Unix-like systems, you say:
+	jpegtran [switches] [inputfile] >outputfile
+On most non-Unix systems, you say:
+	jpegtran [switches] inputfile outputfile
+where both the input and output files are JPEG files.
+
+To specify the coded JPEG representation used in the output file,
+jpegtran accepts a subset of the switches recognized by cjpeg:
+	-optimize	Perform optimization of entropy encoding parameters.
+	-progressive	Create progressive JPEG file.
+	-restart N	Emit a JPEG restart marker every N MCU rows, or every
+			N MCU blocks if "B" is attached to the number.
+	-arithmetic	Use arithmetic coding.
+	-scans file	Use the scan script given in the specified text file.
+See the previous discussion of cjpeg for more details about these switches.
+If you specify none of these switches, you get a plain baseline-JPEG output
+file.  The quality setting and so forth are determined by the input file.
+
+The image can be losslessly transformed by giving one of these switches:
+	-flip horizontal	Mirror image horizontally (left-right).
+	-flip vertical		Mirror image vertically (top-bottom).
+	-rotate 90		Rotate image 90 degrees clockwise.
+	-rotate 180		Rotate image 180 degrees.
+	-rotate 270		Rotate image 270 degrees clockwise (or 90 ccw).
+	-transpose		Transpose image (across UL-to-LR axis).
+	-transverse		Transverse transpose (across UR-to-LL axis).
+
+The transpose transformation has no restrictions regarding image dimensions.
+The other transformations operate rather oddly if the image dimensions are not
+a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
+transform complete blocks of DCT coefficient data in the desired way.
+
+jpegtran's default behavior when transforming an odd-size image is designed
+to preserve exact reversibility and mathematical consistency of the
+transformation set.  As stated, transpose is able to flip the entire image
+area.  Horizontal mirroring leaves any partial iMCU column at the right edge
+untouched, but is able to flip all rows of the image.  Similarly, vertical
+mirroring leaves any partial iMCU row at the bottom edge untouched, but is
+able to flip all columns.  The other transforms can be built up as sequences
+of transpose and flip operations; for consistency, their actions on edge
+pixels are defined to be the same as the end result of the corresponding
+transpose-and-flip sequence.
+
+For practical use, you may prefer to discard any untransformable edge pixels
+rather than having a strange-looking strip along the right and/or bottom edges
+of a transformed image.  To do this, add the -trim switch:
+	-trim		Drop non-transformable edge blocks.
+Obviously, a transformation with -trim is not reversible, so strictly speaking
+jpegtran with this switch is not lossless.  Also, the expected mathematical
+equivalences between the transformations no longer hold.  For example,
+"-rot 270 -trim" trims only the bottom edge, but "-rot 90 -trim" followed by
+"-rot 180 -trim" trims both edges.
+
+If you are only interested in a perfect transformation, add the -perfect switch:
+	-perfect	Fails with an error if the transformation is not
+			perfect.
+For example you may want to do
+  jpegtran -rot 90 -perfect foo.jpg || djpeg foo.jpg | pnmflip -r270 | cjpeg
+to do a perfect rotation if available or an approximated one if not.
+
+We also offer a lossless-crop option, which discards data outside a given
+image region but losslessly preserves what is inside.  Like the rotate and
+flip transforms, lossless crop is restricted by the current JPEG format: the
+upper left corner of the selected region must fall on an iMCU boundary.  If
+this does not hold for the given crop parameters, we silently move the upper
+left corner up and/or left to make it so, simultaneously increasing the region
+dimensions to keep the lower right crop corner unchanged.  (Thus, the output
+image covers at least the requested region, but may cover more.)
+
+The image can be losslessly cropped by giving the switch:
+	-crop WxH+X+Y	Crop to a rectangular subarea of width W, height H
+			starting at point X,Y.
+
+Other not-strictly-lossless transformation switches are:
+
+	-grayscale	Force grayscale output.
+This option discards the chrominance channels if the input image is YCbCr
+(i.e., a standard color JPEG), resulting in a grayscale JPEG file.  The
+luminance channel is preserved exactly, so this is a better method of reducing
+to grayscale than decompression, conversion, and recompression.  This switch
+is particularly handy for fixing a monochrome picture that was mistakenly
+encoded as a color JPEG.  (In such a case, the space savings from getting rid
+of the near-empty chroma channels won't be large; but the decoding time for
+a grayscale JPEG is substantially less than that for a color JPEG.)
+
+	-scale M/N	Scale the output image by a factor M/N.
+Currently supported scale factors are M/N with all M from 1 to 16, where N is
+the source DCT size, which is 8 for baseline JPEG.  If the /N part is omitted,
+then M specifies the DCT scaled size to be applied on the given input.  For
+baseline JPEG this is equivalent to M/8 scaling, since the source DCT size
+for baseline JPEG is 8.  CAUTION: An implementation of the JPEG SmartScale
+extension is required for this feature.  SmartScale enabled JPEG is not yet
+widely implemented, so many decoders will be unable to view a SmartScale
+extended JPEG file at all.
+
+jpegtran also recognizes these switches that control what to do with "extra"
+markers, such as comment blocks:
+	-copy none	Copy no extra markers from source file.  This setting
+			suppresses all comments and other excess baggage
+			present in the source file.
+	-copy comments	Copy only comment markers.  This setting copies
+			comments from the source file, but discards
+			any other inessential (for image display) data.
+	-copy all	Copy all extra markers.  This setting preserves
+			miscellaneous markers found in the source file, such
+			as JFIF thumbnails, Exif data, and Photoshop settings.
+			In some files these extra markers can be sizable.
+The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
+jpegtran always did the equivalent of -copy none.)
+
+Additional switches recognized by jpegtran are:
+	-outfile filename
+	-maxmemory N
+	-verbose
+	-debug
+These work the same as in cjpeg or djpeg.
+
+
+THE COMMENT UTILITIES
+
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+
+We provide two utility programs to display COM block contents and add COM
+blocks to a JPEG file.
+
+rdjpgcom searches a JPEG file and prints the contents of any COM blocks on
+standard output.  The command line syntax is
+	rdjpgcom [-raw] [-verbose] [inputfilename]
+The switch "-raw" (or just "-r") causes rdjpgcom to also output non-printable
+characters in comments, which are normally escaped for security reasons.
+The switch "-verbose" (or just "-v") causes rdjpgcom to also display the JPEG
+image dimensions.  If you omit the input file name from the command line,
+the JPEG file is read from standard input.  (This may not work on some
+operating systems, if binary data can't be read from stdin.)
+
+wrjpgcom adds a COM block, containing text you provide, to a JPEG file.
+Ordinarily, the COM block is added after any existing COM blocks, but you
+can delete the old COM blocks if you wish.  wrjpgcom produces a new JPEG
+file; it does not modify the input file.  DO NOT try to overwrite the input
+file by directing wrjpgcom's output back into it; on most systems this will
+just destroy your file.
+
+The command line syntax for wrjpgcom is similar to cjpeg's.  On Unix-like
+systems, it is
+	wrjpgcom [switches] [inputfilename]
+The output file is written to standard output.  The input file comes from
+the named file, or from standard input if no input file is named.
+
+On most non-Unix systems, the syntax is
+	wrjpgcom [switches] inputfilename outputfilename
+where both input and output file names must be given explicitly.
+
+wrjpgcom understands three switches:
+	-replace		 Delete any existing COM blocks from the file.
+	-comment "Comment text"	 Supply new COM text on command line.
+	-cfile name		 Read text for new COM block from named file.
+(Switch names can be abbreviated.)  If you have only one line of comment text
+to add, you can provide it on the command line with -comment.  The comment
+text must be surrounded with quotes so that it is treated as a single
+argument.  Longer comments can be read from a text file.
+
+If you give neither -comment nor -cfile, then wrjpgcom will read the comment
+text from standard input.  (In this case an input image file name MUST be
+supplied, so that the source JPEG file comes from somewhere else.)  You can
+enter multiple lines, up to 64KB worth.  Type an end-of-file indicator
+(usually control-D or control-Z) to terminate the comment text entry.
+
+wrjpgcom will not add a COM block if the provided comment string is empty.
+Therefore -replace -comment "" can be used to delete all COM blocks from a
+file.
+
+These utility programs do not depend on the IJG JPEG library.  In
+particular, the source code for rdjpgcom is intended as an illustration of
+the minimum amount of code required to parse a JPEG file header correctly.
diff --git a/jpeg/wizard.doc b/jpeg/wizard.doc
deleted file mode 100644
index 54170b227..000000000
--- a/jpeg/wizard.doc
+++ /dev/null
@@ -1,211 +0,0 @@
-Advanced usage instructions for the Independent JPEG Group's JPEG software
-==========================================================================
-
-This file describes cjpeg's "switches for wizards".
-
-The "wizard" switches are intended for experimentation with JPEG by persons
-who are reasonably knowledgeable about the JPEG standard.  If you don't know
-what you are doing, DON'T USE THESE SWITCHES.  You'll likely produce files
-with worse image quality and/or poorer compression than you'd get from the
-default settings.  Furthermore, these switches must be used with caution
-when making files intended for general use, because not all JPEG decoders
-will support unusual JPEG parameter settings.
-
-
-Quantization Table Adjustment
------------------------------
-
-Ordinarily, cjpeg starts with a default set of tables (the same ones given
-as examples in the JPEG standard) and scales them up or down according to
-the -quality setting.  The details of the scaling algorithm can be found in
-jcparam.c.  At very low quality settings, some quantization table entries
-can get scaled up to values exceeding 255.  Although 2-byte quantization
-values are supported by the IJG software, this feature is not in baseline
-JPEG and is not supported by all implementations.  If you need to ensure
-wide compatibility of low-quality files, you can constrain the scaled
-quantization values to no more than 255 by giving the -baseline switch.
-Note that use of -baseline will result in poorer quality for the same file
-size, since more bits than necessary are expended on higher AC coefficients.
-
-You can substitute a different set of quantization values by using the
--qtables switch:
-
-	-qtables file	Use the quantization tables given in the named file.
-
-The specified file should be a text file containing decimal quantization
-values.  The file should contain one to four tables, each of 64 elements.
-The tables are implicitly numbered 0,1,etc. in order of appearance.  Table
-entries appear in normal array order (NOT in the zigzag order in which they
-will be stored in the JPEG file).
-
-Quantization table files are free format, in that arbitrary whitespace can
-appear between numbers.  Also, comments can be included: a comment starts
-with '#' and extends to the end of the line.  Here is an example file that
-duplicates the default quantization tables:
-
-	# Quantization tables given in JPEG spec, section K.1
-
-	# This is table 0 (the luminance table):
-	  16  11  10  16  24  40  51  61
-	  12  12  14  19  26  58  60  55
-	  14  13  16  24  40  57  69  56
-	  14  17  22  29  51  87  80  62
-	  18  22  37  56  68 109 103  77
-	  24  35  55  64  81 104 113  92
-	  49  64  78  87 103 121 120 101
-	  72  92  95  98 112 100 103  99
-
-	# This is table 1 (the chrominance table):
-	  17  18  24  47  99  99  99  99
-	  18  21  26  66  99  99  99  99
-	  24  26  56  99  99  99  99  99
-	  47  66  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-
-If the -qtables switch is used without -quality, then the specified tables
-are used exactly as-is.  If both -qtables and -quality are used, then the
-tables taken from the file are scaled in the same fashion that the default
-tables would be scaled for that quality setting.  If -baseline appears, then
-the quantization values are constrained to the range 1-255.
-
-By default, cjpeg will use quantization table 0 for luminance components and
-table 1 for chrominance components.  To override this choice, use the -qslots
-switch:
-
-	-qslots N[,...]		Select which quantization table to use for
-				each color component.
-
-The -qslots switch specifies a quantization table number for each color
-component, in the order in which the components appear in the JPEG SOF marker.
-For example, to create a separate table for each of Y,Cb,Cr, you could
-provide a -qtables file that defines three quantization tables and say
-"-qslots 0,1,2".  If -qslots gives fewer table numbers than there are color
-components, then the last table number is repeated as necessary.
-
-
-Sampling Factor Adjustment
---------------------------
-
-By default, cjpeg uses 2:1 horizontal and vertical downsampling when
-compressing YCbCr data, and no downsampling for all other color spaces.
-You can override this default with the -sample switch:
-
-	-sample HxV[,...]	Set JPEG sampling factors for each color
-				component.
-
-The -sample switch specifies the JPEG sampling factors for each color
-component, in the order in which they appear in the JPEG SOF marker.
-If you specify fewer HxV pairs than there are components, the remaining
-components are set to 1x1 sampling.  For example, the default YCbCr setting
-is equivalent to "-sample 2x2,1x1,1x1", which can be abbreviated to
-"-sample 2x2".
-
-There are still some JPEG decoders in existence that support only 2x1
-sampling (also called 4:2:2 sampling).  Compatibility with such decoders can
-be achieved by specifying "-sample 2x1".  This is not recommended unless
-really necessary, since it increases file size and encoding/decoding time
-with very little quality gain.
-
-
-Multiple Scan / Progression Control
------------------------------------
-
-By default, cjpeg emits a single-scan sequential JPEG file.  The
--progressive switch generates a progressive JPEG file using a default series
-of progression parameters.  You can create multiple-scan sequential JPEG
-files or progressive JPEG files with custom progression parameters by using
-the -scans switch:
-
-	-scans file	Use the scan sequence given in the named file.
-
-The specified file should be a text file containing a "scan script".
-The script specifies the contents and ordering of the scans to be emitted.
-Each entry in the script defines one scan.  A scan definition specifies
-the components to be included in the scan, and for progressive JPEG it also
-specifies the progression parameters Ss,Se,Ah,Al for the scan.  Scan
-definitions are separated by semicolons (';').  A semicolon after the last
-scan definition is optional.
-
-Each scan definition contains one to four component indexes, optionally
-followed by a colon (':') and the four progressive-JPEG parameters.  The
-component indexes denote which color component(s) are to be transmitted in
-the scan.  Components are numbered in the order in which they appear in the
-JPEG SOF marker, with the first component being numbered 0.  (Note that these
-indexes are not the "component ID" codes assigned to the components, just
-positional indexes.)
-
-The progression parameters for each scan are:
-	Ss	Zigzag index of first coefficient included in scan
-	Se	Zigzag index of last coefficient included in scan
-	Ah	Zero for first scan of a coefficient, else Al of prior scan
-	Al	Successive approximation low bit position for scan
-If the progression parameters are omitted, the values 0,63,0,0 are used,
-producing a sequential JPEG file.  cjpeg automatically determines whether
-the script represents a progressive or sequential file, by observing whether
-Ss and Se values other than 0 and 63 appear.  (The -progressive switch is
-not needed to specify this; in fact, it is ignored when -scans appears.)
-The scan script must meet the JPEG restrictions on progression sequences.
-(cjpeg checks that the spec's requirements are obeyed.)
-
-Scan script files are free format, in that arbitrary whitespace can appear
-between numbers and around punctuation.  Also, comments can be included: a
-comment starts with '#' and extends to the end of the line.  For additional
-legibility, commas or dashes can be placed between values.  (Actually, any
-single punctuation character other than ':' or ';' can be inserted.)  For
-example, the following two scan definitions are equivalent:
-	0 1 2: 0 63 0 0;
-	0,1,2 : 0-63, 0,0 ;
-
-Here is an example of a scan script that generates a partially interleaved
-sequential JPEG file:
-
-	0;			# Y only in first scan
-	1 2;			# Cb and Cr in second scan
-
-Here is an example of a progressive scan script using only spectral selection
-(no successive approximation):
-
-	# Interleaved DC scan for Y,Cb,Cr:
-	0,1,2: 0-0,   0, 0 ;
-	# AC scans:
-	0:     1-2,   0, 0 ;	# First two Y AC coefficients
-	0:     3-5,   0, 0 ;	# Three more
-	1:     1-63,  0, 0 ;	# All AC coefficients for Cb
-	2:     1-63,  0, 0 ;	# All AC coefficients for Cr
-	0:     6-9,   0, 0 ;	# More Y coefficients
-	0:     10-63, 0, 0 ;	# Remaining Y coefficients
-
-Here is an example of a successive-approximation script.  This is equivalent
-to the default script used by "cjpeg -progressive" for YCbCr images:
-
-	# Initial DC scan for Y,Cb,Cr (lowest bit not sent)
-	0,1,2: 0-0,   0, 1 ;
-	# First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits:
-	0:     1-5,   0, 2 ;
-	# Send all Cr,Cb AC coefficients, minus lowest bit:
-	# (chroma data is usually too small to be worth subdividing further;
-	#  but note we send Cr first since eye is least sensitive to Cb)
-	2:     1-63,  0, 1 ;
-	1:     1-63,  0, 1 ;
-	# Send remaining Y AC coefficients, minus 2 lowest bits:
-	0:     6-63,  0, 2 ;
-	# Send next-to-lowest bit of all Y AC coefficients:
-	0:     1-63,  2, 1 ;
-	# At this point we've sent all but the lowest bit of all coefficients.
-	# Send lowest bit of DC coefficients
-	0,1,2: 0-0,   1, 0 ;
-	# Send lowest bit of AC coefficients
-	2:     1-63,  1, 0 ;
-	1:     1-63,  1, 0 ;
-	# Y AC lowest bit scan is last; it's usually the largest scan
-	0:     1-63,  1, 0 ;
-
-It may be worth pointing out that this script is tuned for quality settings
-of around 50 to 75.  For lower quality settings, you'd probably want to use
-a script with fewer stages of successive approximation (otherwise the
-initial scans will be really bad).  For higher quality settings, you might
-want to use more stages of successive approximation (so that the initial
-scans are not too large).
diff --git a/jpeg/wizard.txt b/jpeg/wizard.txt
new file mode 100644
index 000000000..54170b227
--- /dev/null
+++ b/jpeg/wizard.txt
@@ -0,0 +1,211 @@
+Advanced usage instructions for the Independent JPEG Group's JPEG software
+==========================================================================
+
+This file describes cjpeg's "switches for wizards".
+
+The "wizard" switches are intended for experimentation with JPEG by persons
+who are reasonably knowledgeable about the JPEG standard.  If you don't know
+what you are doing, DON'T USE THESE SWITCHES.  You'll likely produce files
+with worse image quality and/or poorer compression than you'd get from the
+default settings.  Furthermore, these switches must be used with caution
+when making files intended for general use, because not all JPEG decoders
+will support unusual JPEG parameter settings.
+
+
+Quantization Table Adjustment
+-----------------------------
+
+Ordinarily, cjpeg starts with a default set of tables (the same ones given
+as examples in the JPEG standard) and scales them up or down according to
+the -quality setting.  The details of the scaling algorithm can be found in
+jcparam.c.  At very low quality settings, some quantization table entries
+can get scaled up to values exceeding 255.  Although 2-byte quantization
+values are supported by the IJG software, this feature is not in baseline
+JPEG and is not supported by all implementations.  If you need to ensure
+wide compatibility of low-quality files, you can constrain the scaled
+quantization values to no more than 255 by giving the -baseline switch.
+Note that use of -baseline will result in poorer quality for the same file
+size, since more bits than necessary are expended on higher AC coefficients.
+
+You can substitute a different set of quantization values by using the
+-qtables switch:
+
+	-qtables file	Use the quantization tables given in the named file.
+
+The specified file should be a text file containing decimal quantization
+values.  The file should contain one to four tables, each of 64 elements.
+The tables are implicitly numbered 0,1,etc. in order of appearance.  Table
+entries appear in normal array order (NOT in the zigzag order in which they
+will be stored in the JPEG file).
+
+Quantization table files are free format, in that arbitrary whitespace can
+appear between numbers.  Also, comments can be included: a comment starts
+with '#' and extends to the end of the line.  Here is an example file that
+duplicates the default quantization tables:
+
+	# Quantization tables given in JPEG spec, section K.1
+
+	# This is table 0 (the luminance table):
+	  16  11  10  16  24  40  51  61
+	  12  12  14  19  26  58  60  55
+	  14  13  16  24  40  57  69  56
+	  14  17  22  29  51  87  80  62
+	  18  22  37  56  68 109 103  77
+	  24  35  55  64  81 104 113  92
+	  49  64  78  87 103 121 120 101
+	  72  92  95  98 112 100 103  99
+
+	# This is table 1 (the chrominance table):
+	  17  18  24  47  99  99  99  99
+	  18  21  26  66  99  99  99  99
+	  24  26  56  99  99  99  99  99
+	  47  66  99  99  99  99  99  99
+	  99  99  99  99  99  99  99  99
+	  99  99  99  99  99  99  99  99
+	  99  99  99  99  99  99  99  99
+	  99  99  99  99  99  99  99  99
+
+If the -qtables switch is used without -quality, then the specified tables
+are used exactly as-is.  If both -qtables and -quality are used, then the
+tables taken from the file are scaled in the same fashion that the default
+tables would be scaled for that quality setting.  If -baseline appears, then
+the quantization values are constrained to the range 1-255.
+
+By default, cjpeg will use quantization table 0 for luminance components and
+table 1 for chrominance components.  To override this choice, use the -qslots
+switch:
+
+	-qslots N[,...]		Select which quantization table to use for
+				each color component.
+
+The -qslots switch specifies a quantization table number for each color
+component, in the order in which the components appear in the JPEG SOF marker.
+For example, to create a separate table for each of Y,Cb,Cr, you could
+provide a -qtables file that defines three quantization tables and say
+"-qslots 0,1,2".  If -qslots gives fewer table numbers than there are color
+components, then the last table number is repeated as necessary.
+
+
+Sampling Factor Adjustment
+--------------------------
+
+By default, cjpeg uses 2:1 horizontal and vertical downsampling when
+compressing YCbCr data, and no downsampling for all other color spaces.
+You can override this default with the -sample switch:
+
+	-sample HxV[,...]	Set JPEG sampling factors for each color
+				component.
+
+The -sample switch specifies the JPEG sampling factors for each color
+component, in the order in which they appear in the JPEG SOF marker.
+If you specify fewer HxV pairs than there are components, the remaining
+components are set to 1x1 sampling.  For example, the default YCbCr setting
+is equivalent to "-sample 2x2,1x1,1x1", which can be abbreviated to
+"-sample 2x2".
+
+There are still some JPEG decoders in existence that support only 2x1
+sampling (also called 4:2:2 sampling).  Compatibility with such decoders can
+be achieved by specifying "-sample 2x1".  This is not recommended unless
+really necessary, since it increases file size and encoding/decoding time
+with very little quality gain.
+
+
+Multiple Scan / Progression Control
+-----------------------------------
+
+By default, cjpeg emits a single-scan sequential JPEG file.  The
+-progressive switch generates a progressive JPEG file using a default series
+of progression parameters.  You can create multiple-scan sequential JPEG
+files or progressive JPEG files with custom progression parameters by using
+the -scans switch:
+
+	-scans file	Use the scan sequence given in the named file.
+
+The specified file should be a text file containing a "scan script".
+The script specifies the contents and ordering of the scans to be emitted.
+Each entry in the script defines one scan.  A scan definition specifies
+the components to be included in the scan, and for progressive JPEG it also
+specifies the progression parameters Ss,Se,Ah,Al for the scan.  Scan
+definitions are separated by semicolons (';').  A semicolon after the last
+scan definition is optional.
+
+Each scan definition contains one to four component indexes, optionally
+followed by a colon (':') and the four progressive-JPEG parameters.  The
+component indexes denote which color component(s) are to be transmitted in
+the scan.  Components are numbered in the order in which they appear in the
+JPEG SOF marker, with the first component being numbered 0.  (Note that these
+indexes are not the "component ID" codes assigned to the components, just
+positional indexes.)
+
+The progression parameters for each scan are:
+	Ss	Zigzag index of first coefficient included in scan
+	Se	Zigzag index of last coefficient included in scan
+	Ah	Zero for first scan of a coefficient, else Al of prior scan
+	Al	Successive approximation low bit position for scan
+If the progression parameters are omitted, the values 0,63,0,0 are used,
+producing a sequential JPEG file.  cjpeg automatically determines whether
+the script represents a progressive or sequential file, by observing whether
+Ss and Se values other than 0 and 63 appear.  (The -progressive switch is
+not needed to specify this; in fact, it is ignored when -scans appears.)
+The scan script must meet the JPEG restrictions on progression sequences.
+(cjpeg checks that the spec's requirements are obeyed.)
+
+Scan script files are free format, in that arbitrary whitespace can appear
+between numbers and around punctuation.  Also, comments can be included: a
+comment starts with '#' and extends to the end of the line.  For additional
+legibility, commas or dashes can be placed between values.  (Actually, any
+single punctuation character other than ':' or ';' can be inserted.)  For
+example, the following two scan definitions are equivalent:
+	0 1 2: 0 63 0 0;
+	0,1,2 : 0-63, 0,0 ;
+
+Here is an example of a scan script that generates a partially interleaved
+sequential JPEG file:
+
+	0;			# Y only in first scan
+	1 2;			# Cb and Cr in second scan
+
+Here is an example of a progressive scan script using only spectral selection
+(no successive approximation):
+
+	# Interleaved DC scan for Y,Cb,Cr:
+	0,1,2: 0-0,   0, 0 ;
+	# AC scans:
+	0:     1-2,   0, 0 ;	# First two Y AC coefficients
+	0:     3-5,   0, 0 ;	# Three more
+	1:     1-63,  0, 0 ;	# All AC coefficients for Cb
+	2:     1-63,  0, 0 ;	# All AC coefficients for Cr
+	0:     6-9,   0, 0 ;	# More Y coefficients
+	0:     10-63, 0, 0 ;	# Remaining Y coefficients
+
+Here is an example of a successive-approximation script.  This is equivalent
+to the default script used by "cjpeg -progressive" for YCbCr images:
+
+	# Initial DC scan for Y,Cb,Cr (lowest bit not sent)
+	0,1,2: 0-0,   0, 1 ;
+	# First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits:
+	0:     1-5,   0, 2 ;
+	# Send all Cr,Cb AC coefficients, minus lowest bit:
+	# (chroma data is usually too small to be worth subdividing further;
+	#  but note we send Cr first since eye is least sensitive to Cb)
+	2:     1-63,  0, 1 ;
+	1:     1-63,  0, 1 ;
+	# Send remaining Y AC coefficients, minus 2 lowest bits:
+	0:     6-63,  0, 2 ;
+	# Send next-to-lowest bit of all Y AC coefficients:
+	0:     1-63,  2, 1 ;
+	# At this point we've sent all but the lowest bit of all coefficients.
+	# Send lowest bit of DC coefficients
+	0,1,2: 0-0,   1, 0 ;
+	# Send lowest bit of AC coefficients
+	2:     1-63,  1, 0 ;
+	1:     1-63,  1, 0 ;
+	# Y AC lowest bit scan is last; it's usually the largest scan
+	0:     1-63,  1, 0 ;
+
+It may be worth pointing out that this script is tuned for quality settings
+of around 50 to 75.  For lower quality settings, you'd probably want to use
+a script with fewer stages of successive approximation (otherwise the
+initial scans will be really bad).  For higher quality settings, you might
+want to use more stages of successive approximation (so that the initial
+scans are not too large).
-- 
cgit v1.2.3