Back to index

plt-scheme  4.2.1
pnggccrd.c
Go to the documentation of this file.
00001 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
00002  *
00003  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
00004  *
00005  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
00006  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
00007  *     for Intel's performance analysis of the MMX vs. non-MMX code.
00008  *
00009  * libpng version 1.2.5 - October 3, 2002
00010  * For conditions of distribution and use, see copyright notice in png.h
00011  * Copyright (c) 1998-2002 Glenn Randers-Pehrson
00012  * Copyright (c) 1998, Intel Corporation
00013  *
00014  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
00015  * Interface to libpng contributed by Gilles Vollant, 1999.
00016  * GNU C port by Greg Roelofs, 1999-2001.
00017  *
00018  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
00019  *
00020  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
00021  *
00022  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
00023  *
00024  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
00025  *        is required to assemble the newer MMX instructions such as movq.
00026  *        For djgpp, see
00027  *
00028  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
00029  *
00030  *        (or a later version in the same directory).  For Linux, check your
00031  *        distribution's web site(s) or try these links:
00032  *
00033  *           http://rufus.w3.org/linux/RPM/binutils.html
00034  *           http://www.debian.org/Packages/stable/devel/binutils.html
00035  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
00036  *             binutils.tgz
00037  *
00038  *        For other platforms, see the main GNU site:
00039  *
00040  *           ftp://ftp.gnu.org/pub/gnu/binutils/
00041  *
00042  *        Version 2.5.2l.15 is definitely too old...
00043  */
00044 
00045 /*
00046  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
00047  * =====================================
00048  *
00049  * 19991006:
00050  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
00051  *
00052  * 19991007:
00053  *  - additional optimizations (possible or definite):
00054  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
00055  *     - write MMX code for 48-bit case (pixel_bytes == 6)
00056  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
00057  *        why subtract 8 from width_mmx in the pass 4/5 case?
00058  *        (only width_mmx case) (near line 1606)
00059  *     x [DONE] replace pixel_bytes within each block with the true
00060  *        constant value (or are compilers smart enough to do that?)
00061  *     - rewrite all MMX interlacing code so it's aligned with
00062  *        the *beginning* of the row buffer, not the end.  This
00063  *        would not only allow one to eliminate half of the memory
00064  *        writes for odd passes (that is, pass == odd), it may also
00065  *        eliminate some unaligned-data-access exceptions (assuming
00066  *        there's a penalty for not aligning 64-bit accesses on
00067  *        64-bit boundaries).  The only catch is that the "leftover"
00068  *        pixel(s) at the end of the row would have to be saved,
00069  *        but there are enough unused MMX registers in every case,
00070  *        so this is not a problem.  A further benefit is that the
00071  *        post-MMX cleanup code (C code) in at least some of the
00072  *        cases could be done within the assembler block.
00073  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
00074  *     inconsistent, and don't match the MMX Programmer's Reference
00075  *     Manual conventions anyway.  They should be changed to
00076  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
00077  *     was lowest in memory (e.g., corresponding to a left pixel)
00078  *     and b7 is the byte that was highest (e.g., a right pixel).
00079  *
00080  * 19991016:
00081  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
00082  *     want globals prefixed by underscores when referencing them--
00083  *     i.e., if the variable is const4, then refer to it as const4,
00084  *     not _const4.  This seems to be a djgpp-specific requirement.
00085  *     Also, such variables apparently *must* be declared outside
00086  *     of functions; neither static nor automatic variables work if
00087  *     defined within the scope of a single function, but both
00088  *     static and truly global (multi-module) variables work fine.
00089  *
00090  * 19991023:
00091  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
00092  *  - switched from string-concatenation-with-macros to cleaner method of
00093  *     renaming global variables for djgpp--i.e., always use prefixes in
00094  *     inlined assembler code (== strings) and conditionally rename the
00095  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
00096  *
00097  * 19991024:
00098  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
00099  *     This one was severely weird:  even though mmxsupport() doesn't touch
00100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
00101  *     the register (even in static/non-fPIC code--see below), which in turn
00102  *     caused png_do_read_interlace() to return prematurely on the first row of
00103  *     interlaced images (i.e., without expanding the interlaced pixels).
00104  *     Inspection of the generated assembly code didn't turn up any clues,
00105  *     although it did point at a minor optimization (i.e., get rid of
00106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
00107  *     instruction is more destructive than it looks?  (Not yet checked.)
00108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
00109  *     listings...  Apparently register spillage has to do with ebx, since
00110  *     it's used to index the global offset table.  Commenting it out of the
00111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
00112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
00113  *
00114  * 19991107:
00115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
00116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
00117  *
00118  * 19991120:
00119  *  - made "diff" variable (now "_dif") global to simplify conversion of
00120  *     filtering routines (running out of regs, sigh).  "diff" is still used
00121  *     in interlacing routines, however.
00122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
00123  *     macro determines which is used); original not yet tested.
00124  *
00125  * 20000213:
00126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
00127  *
00128  * 20000319:
00129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
00130  *     pass == 4 or 5, that caused visible corruption of interlaced images
00131  *
00132  * 20000623:
00133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
00134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
00135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
00136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
00137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
00138  *     for the original (anonymous) SourceForge bug report.
00139  *
00140  * 20000706:
00141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
00142  *       pnggccrd.c: In function `png_combine_row':
00143  *       pnggccrd.c:525: more than 10 operands in `asm'
00144  *       pnggccrd.c:669: more than 10 operands in `asm'
00145  *       pnggccrd.c:828: more than 10 operands in `asm'
00146  *       pnggccrd.c:994: more than 10 operands in `asm'
00147  *       pnggccrd.c:1177: more than 10 operands in `asm'
00148  *     They are all the same problem and can be worked around by using the
00149  *     global _unmask variable unconditionally, not just in the -fPIC case.
00150  *     Reportedly earlier versions of gcc also have the problem with more than
00151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
00152  *
00153  * 20000729:
00154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
00155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
00156  *  - to finish remaining sections:
00157  *     - clean up indentation and comments
00158  *     - preload local variables
00159  *     - add output and input regs (order of former determines numerical
00160  *        mapping of latter)
00161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
00162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
00163  *
00164  * 20000731:
00165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
00166  *
00167  * 20000822:
00168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
00169  *     shared-library (-fPIC) version!  Code works just fine as part of static
00170  *     library.  Damn damn damn damn damn, should have tested that sooner.
00171  *     ebx is getting clobbered again (explicitly this time); need to save it
00172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
00173  *
00174  * 20000823:
00175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
00176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
00177  *     and *Mask* globals and got rid of leading "$" signs.
00178  *
00179  * 20000826:
00180  *  - added visual separators to help navigate microscopic printed copies
00181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
00182  *     on png_read_filter_row_mmx_avg()
00183  *
00184  * 20000828:
00185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
00186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
00187  *     cleaned up/shortened in either routine, but functionality is complete
00188  *     and seems to be working fine.
00189  *
00190  * 20000829:
00191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
00192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
00193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
00194  *     is simple enough...
00195  *
00196  * 20000914:
00197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
00198  *     correctly (but 48-bit RGB just fine)
00199  *
00200  * 20000916:
00201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
00202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
00203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
00204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
00205  *
00206  * 20010101:
00207  *  - added new png_init_mmx_flags() function (here only because it needs to
00208  *     call mmxsupport(), which should probably become global png_mmxsupport());
00209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
00210  *
00211  * 20010103:
00212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
00213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
00214  *
00215  * 20010104:
00216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
00217  *     within MMX version of png_read_filter_row()) so no longer necessary to
00218  *     compile it into pngrutil.o
00219  *
00220  * 20010310:
00221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
00222  *
00223  * 20020304:
00224  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
00225  *
00226  * STILL TO DO:
00227  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
00228  *     - write MMX code for 48-bit case (pixel_bytes == 6)
00229  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
00230  *        why subtract 8 from width_mmx in the pass 4/5 case?
00231  *        (only width_mmx case) (near line 1606)
00232  *     - rewrite all MMX interlacing code so it's aligned with beginning
00233  *        of the row buffer, not the end (see 19991007 for details)
00234  *     x pick one version of mmxsupport() and get rid of the other
00235  *     - add error messages to any remaining bogus default cases
00236  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
00237  *     x add support for runtime enable/disable/query of various MMX routines
00238  */
00239 
00240 #define PNG_INTERNAL
00241 #include "png.h"
00242 
00243 #if defined(PNG_USE_PNGGCCRD)
00244 
/* Prototype for the MMX support probe.  png_combine_row() calls it (ignoring
 * the return value) when _mmx_supported still holds its initial value of 2.
 */
00245 int PNGAPI png_mmx_support(void);
00246 
/* Interlace-pass tables (seven entries each, indexed by png_ptr->pass).
 * The 8-bit C fallback in png_combine_row() reads them as, respectively,
 * the first byte offset to copy, the distance between successive copies,
 * and the number of bytes copied at each step.
 */
00247 #ifdef PNG_USE_LOCAL_ARRAYS
00248 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
00249 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
00250 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
00251 #endif
00252 
00253 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
00254 /* The inline assembly in this file refers to these globals by their
00255  * underscore-prefixed names (e.g. "_unmask", "_mask8_0").  djgpp, Win32,
 * and Cygwin add their own leading underscore to global variables, so on
 * those targets the C-level names are mapped to the unprefixed spelling.
 * The second group of renames is applied only when PNG_THREAD_UNSAFE_OK is
 * defined, which is also the condition under which those variables are
 * declared further down. */
00256 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
00257 #  define _mmx_supported  mmx_supported
00258 #  define _const4         const4
00259 #  define _const6         const6
00260 #  define _mask8_0        mask8_0
00261 #  define _mask16_1       mask16_1
00262 #  define _mask16_0       mask16_0
00263 #  define _mask24_2       mask24_2
00264 #  define _mask24_1       mask24_1
00265 #  define _mask24_0       mask24_0
00266 #  define _mask32_3       mask32_3
00267 #  define _mask32_2       mask32_2
00268 #  define _mask32_1       mask32_1
00269 #  define _mask32_0       mask32_0
00270 #  define _mask48_5       mask48_5
00271 #  define _mask48_4       mask48_4
00272 #  define _mask48_3       mask48_3
00273 #  define _mask48_2       mask48_2
00274 #  define _mask48_1       mask48_1
00275 #  define _mask48_0       mask48_0
00276 #  define _LBCarryMask    LBCarryMask
00277 #  define _HBClearMask    HBClearMask
00278 #  define _ActiveMask     ActiveMask
00279 #  define _ActiveMask2    ActiveMask2
00280 #  define _ActiveMaskEnd  ActiveMaskEnd
00281 #  define _ShiftBpp       ShiftBpp
00282 #  define _ShiftRem       ShiftRem
00283 #ifdef PNG_THREAD_UNSAFE_OK
00284 #  define _unmask         unmask
00285 #  define _FullLength     FullLength
00286 #  define _MMXLength      MMXLength
00287 #  define _dif            dif
00288 #  define _patemp         patemp
00289 #  define _pbtemp         pbtemp
00290 #  define _pctemp         pctemp
00291 #endif
00292 #endif
00293 
00294 
00295 /* These constants are used in the inlined MMX assembly code.
00296    Ignore gcc's "At top level: defined but not used" warnings. */
00297 
00298 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
00299  *  since that case uses the %ebx register for indexing the Global Offset Table
00300  *  and there were no other registers available.  But gcc 2.95 and later emit
00301  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
00302  *  in the non-PIC case, so we'll just use the global unconditionally now.
 *
 *  png_combine_row() stores ~mask here immediately before each MMX asm
 *  block, and the asm loads it by name with movd.  Because it is a
 *  file-scope variable shared by all callers, it is compiled only when
 *  PNG_THREAD_UNSAFE_OK is defined.
00303  */
00304 #ifdef PNG_THREAD_UNSAFE_OK
00305 static int _unmask;
00306 #endif
00307 
/* Byte-lane bit selectors referenced by name from the MMX assembly.  Bytes
 * are described here from the lowest address upward.
 *
 * _mask8_0 holds 0x80, 0x40, ..., 0x01, one selector per byte.  The 8-bit
 * path of png_combine_row() ANDs it with the low byte of _unmask replicated
 * into all eight lanes and then compares the result with zero, so a lane
 * ends up as 0xFF exactly when the corresponding bit of mask is set.
 *
 * In the _mask16/_mask24/_mask32/_mask48 families each selector value is
 * repeated for 2/3/4/6 consecutive bytes, and the eight selectors continue
 * across the _0, _1, ... quadwords in that order.
 * NOTE(review): the code that consumes those families is not visible in
 * this excerpt -- confirm usage there before relying on this description.
 */
00308 static unsigned long long _mask8_0  = 0x0102040810204080LL;
00309 
00310 static unsigned long long _mask16_1 = 0x0101020204040808LL;
00311 static unsigned long long _mask16_0 = 0x1010202040408080LL;
00312 
00313 static unsigned long long _mask24_2 = 0x0101010202020404LL;
00314 static unsigned long long _mask24_1 = 0x0408080810101020LL;
00315 static unsigned long long _mask24_0 = 0x2020404040808080LL;
00316 
00317 static unsigned long long _mask32_3 = 0x0101010102020202LL;
00318 static unsigned long long _mask32_2 = 0x0404040408080808LL;
00319 static unsigned long long _mask32_1 = 0x1010101020202020LL;
00320 static unsigned long long _mask32_0 = 0x4040404080808080LL;
00321 
00322 static unsigned long long _mask48_5 = 0x0101010101010202LL;
00323 static unsigned long long _mask48_4 = 0x0202020204040404LL;
00324 static unsigned long long _mask48_3 = 0x0404080808080808LL;
00325 static unsigned long long _mask48_2 = 0x1010101010102020LL;
00326 static unsigned long long _mask48_1 = 0x2020202040404040LL;
00327 static unsigned long long _mask48_0 = 0x4040808080808080LL;
00328 
/* _const4 keeps the low three bytes of a quadword, _const6 the low byte.
 * Their users are not visible in this excerpt. */
00329 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
00330 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
00331 static unsigned long long _const6   = 0x00000000000000FFLL;
00332 
00333 // These are used in the row-filter routines and should/would be local
00334 //  variables if not for gcc addressing limitations.
00335 // WARNING: Their presence probably defeats the thread safety of libpng.
// They are compiled only when PNG_THREAD_UNSAFE_OK is defined.
// (The row-filter routines themselves are not part of this excerpt.)
00336 
00337 #ifdef PNG_THREAD_UNSAFE_OK
00338 static png_uint_32  _FullLength;
00339 static png_uint_32  _MMXLength;
00340 static int          _dif;    // formerly "diff"; made global per the 19991120 changelog entry
00341 static int          _patemp; // temp variables for Paeth routine
00342 static int          _pbtemp;
00343 static int          _pctemp;
00344 #endif
00345 
/* png_squelch_warnings:  assigns each listed file-scope static to itself so
 * that the compiler sees a C-level reference to it.  Several of these
 * variables are otherwise named only inside inline-assembly strings (for
 * example "_mask8_0" in png_combine_row()), which is what triggers gcc's
 * "defined but not used" warnings.  The stored values are left unchanged.
 * Takes no arguments and returns nothing.
 *
 * The first group exists only when PNG_THREAD_UNSAFE_OK is defined, so it is
 * guarded by the same macro.
 * NOTE(review): _unmask and _FullLength are not touched here.
 */
00346 void /* PRIVATE */
00347 png_squelch_warnings(void)
00348 {
00349 #ifdef PNG_THREAD_UNSAFE_OK
00350    _dif = _dif;
00351    _patemp = _patemp;
00352    _pbtemp = _pbtemp;
00353    _pctemp = _pctemp;
00354    _MMXLength = _MMXLength;
00355 #endif
00356    _const4  = _const4;
00357    _const6  = _const6;
00358    _mask8_0  = _mask8_0;
00359    _mask16_1 = _mask16_1;
00360    _mask16_0 = _mask16_0;
00361    _mask24_2 = _mask24_2;
00362    _mask24_1 = _mask24_1;
00363    _mask24_0 = _mask24_0;
00364    _mask32_3 = _mask32_3;
00365    _mask32_2 = _mask32_2;
00366    _mask32_1 = _mask32_1;
00367    _mask32_0 = _mask32_0;
00368    _mask48_5 = _mask48_5;
00369    _mask48_4 = _mask48_4;
00370    _mask48_3 = _mask48_3;
00371    _mask48_2 = _mask48_2;
00372    _mask48_1 = _mask48_1;
00373    _mask48_0 = _mask48_0;
00374 }
00375 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00376 
00377 
/* MMX availability flag.  It starts at 2; png_combine_row() treats that value
 * as "not initialized yet", issues a warning and calls png_mmx_support().
 * In PNG_1_0_X builds png_combine_row() takes the MMX path whenever this is
 * nonzero.  Note that it is defined outside the PNG_ASSEMBLER_CODE_SUPPORTED
 * conditional that closes just above.
 */
00378 static int _mmx_supported = 2;
00379 
00380 /*===========================================================================*/
00381 /*                                                                           */
00382 /*                       P N G _ C O M B I N E _ R O W                       */
00383 /*                                                                           */
00384 /*===========================================================================*/
00385 
00386 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
00387 
/* Symbolic names for the supported pixel sizes in bytes.
 * NOTE(review): their uses lie outside this excerpt. */
00388 #define BPP2  2
00389 #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
00390 #define BPP4  4
00391 #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
00392 #define BPP8  8
00393 
00394 /* Combines the row recently read in with the previous row.
00395    This routine takes care of alpha and transparency if requested.
00396    This routine also handles the two methods of progressive display
00397    of interlaced images, depending on the mask value.
00398    The mask value describes which pixels are to be combined with
00399    the row.  The pattern always repeats every 8 pixels, so just 8
00400    bits are needed.  A one indicates the pixel is to be combined; a
00401    zero indicates the pixel is to be skipped.  This is in addition
00402    to any alpha or transparency value associated with the pixel.
00403    If you want all pixels to be combined, pass 0xff (255) in mask. */
00404 
00405 /* Use this routine for the x86 platform - it uses a faster MMX routine
00406    if the machine supports MMX. */
00407 
00408 void /* PRIVATE */
00409 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
00410 {
00411    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
00412 
00413 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
00414    if (_mmx_supported == 2) {
00415        /* this should have happened in png_init_mmx_flags() already */
00416        png_warning(png_ptr, "asm_flags may not have been initialized");
00417        png_mmx_support();
00418    }
00419 #endif
00420 
00421    if (mask == 0xff)
00422    {
00423       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
00424       png_memcpy(row, png_ptr->row_buf + 1,
00425        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
00426    }
00427    else   /* (png_combine_row() is never called with mask == 0) */
00428    {
00429       switch (png_ptr->row_info.pixel_depth)
00430       {
00431          case 1:        /* png_ptr->row_info.pixel_depth */
00432          {
00433             png_bytep sp;
00434             png_bytep dp;
00435             int s_inc, s_start, s_end;
00436             int m;
00437             int shift;
00438             png_uint_32 i;
00439 
00440             sp = png_ptr->row_buf + 1;
00441             dp = row;
00442             m = 0x80;
00443 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00444             if (png_ptr->transformations & PNG_PACKSWAP)
00445             {
00446                 s_start = 0;
00447                 s_end = 7;
00448                 s_inc = 1;
00449             }
00450             else
00451 #endif
00452             {
00453                 s_start = 7;
00454                 s_end = 0;
00455                 s_inc = -1;
00456             }
00457 
00458             shift = s_start;
00459 
00460             for (i = 0; i < png_ptr->width; i++)
00461             {
00462                if (m & mask)
00463                {
00464                   int value;
00465 
00466                   value = (*sp >> shift) & 0x1;
00467                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
00468                   *dp |= (png_byte)(value << shift);
00469                }
00470 
00471                if (shift == s_end)
00472                {
00473                   shift = s_start;
00474                   sp++;
00475                   dp++;
00476                }
00477                else
00478                   shift += s_inc;
00479 
00480                if (m == 1)
00481                   m = 0x80;
00482                else
00483                   m >>= 1;
00484             }
00485             break;
00486          }
00487 
00488          case 2:        /* png_ptr->row_info.pixel_depth */
00489          {
00490             png_bytep sp;
00491             png_bytep dp;
00492             int s_start, s_end, s_inc;
00493             int m;
00494             int shift;
00495             png_uint_32 i;
00496             int value;
00497 
00498             sp = png_ptr->row_buf + 1;
00499             dp = row;
00500             m = 0x80;
00501 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00502             if (png_ptr->transformations & PNG_PACKSWAP)
00503             {
00504                s_start = 0;
00505                s_end = 6;
00506                s_inc = 2;
00507             }
00508             else
00509 #endif
00510             {
00511                s_start = 6;
00512                s_end = 0;
00513                s_inc = -2;
00514             }
00515 
00516             shift = s_start;
00517 
00518             for (i = 0; i < png_ptr->width; i++)
00519             {
00520                if (m & mask)
00521                {
00522                   value = (*sp >> shift) & 0x3;
00523                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
00524                   *dp |= (png_byte)(value << shift);
00525                }
00526 
00527                if (shift == s_end)
00528                {
00529                   shift = s_start;
00530                   sp++;
00531                   dp++;
00532                }
00533                else
00534                   shift += s_inc;
00535                if (m == 1)
00536                   m = 0x80;
00537                else
00538                   m >>= 1;
00539             }
00540             break;
00541          }
00542 
00543          case 4:        /* png_ptr->row_info.pixel_depth */
00544          {
00545             png_bytep sp;
00546             png_bytep dp;
00547             int s_start, s_end, s_inc;
00548             int m;
00549             int shift;
00550             png_uint_32 i;
00551             int value;
00552 
00553             sp = png_ptr->row_buf + 1;
00554             dp = row;
00555             m = 0x80;
00556 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00557             if (png_ptr->transformations & PNG_PACKSWAP)
00558             {
00559                s_start = 0;
00560                s_end = 4;
00561                s_inc = 4;
00562             }
00563             else
00564 #endif
00565             {
00566                s_start = 4;
00567                s_end = 0;
00568                s_inc = -4;
00569             }
00570             shift = s_start;
00571 
00572             for (i = 0; i < png_ptr->width; i++)
00573             {
00574                if (m & mask)
00575                {
00576                   value = (*sp >> shift) & 0xf;
00577                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
00578                   *dp |= (png_byte)(value << shift);
00579                }
00580 
00581                if (shift == s_end)
00582                {
00583                   shift = s_start;
00584                   sp++;
00585                   dp++;
00586                }
00587                else
00588                   shift += s_inc;
00589                if (m == 1)
00590                   m = 0x80;
00591                else
00592                   m >>= 1;
00593             }
00594             break;
00595          }
00596 
00597          case 8:        /* png_ptr->row_info.pixel_depth */
00598          {
00599             png_bytep srcptr;
00600             png_bytep dstptr;
00601 
00602 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00603 #if !defined(PNG_1_0_X)
00604             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00605                 /* && _mmx_supported */ )
00606 #else
00607             if (_mmx_supported)
00608 #endif
00609             {
00610                png_uint_32 len;
00611                int diff;
00612                int dummy_value_a;   // fix 'forbidden register spilled' error
00613                int dummy_value_d;
00614                int dummy_value_c;
00615                int dummy_value_S;
00616                int dummy_value_D;
00617                _unmask = ~mask;            // global variable for -fPIC version
00618                srcptr = png_ptr->row_buf + 1;
00619                dstptr = row;
00620                len  = png_ptr->width &~7;  // reduce to multiple of 8
00621                diff = (int) (png_ptr->width & 7);  // amount lost
00622 
00623                __asm__ __volatile__ (
00624                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
00625                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
00626                   "punpcklbw %%mm7, %%mm7    \n\t"
00627                   "punpcklwd %%mm7, %%mm7    \n\t"
00628                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
00629 
00630                   "movq      _mask8_0, %%mm0 \n\t"
00631                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
00632                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
00633 
00634 // preload        "movl      len, %%ecx      \n\t" // load length of line
00635 // preload        "movl      srcptr, %%esi   \n\t" // load source
00636 // preload        "movl      dstptr, %%edi   \n\t" // load dest
00637 
00638                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
00639                   "je        mainloop8end    \n\t"
00640 
00641                 "mainloop8:                  \n\t"
00642                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
00643                   "pand      %%mm0, %%mm4    \n\t"
00644                   "movq      %%mm0, %%mm6    \n\t"
00645                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
00646                   "por       %%mm6, %%mm4    \n\t"
00647                   "movq      %%mm4, (%%edi)  \n\t"
00648                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
00649                   "addl      $8, %%edi       \n\t"
00650                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
00651                   "ja        mainloop8       \n\t"
00652 
00653                 "mainloop8end:               \n\t"
00654 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
00655                   "movl      %%eax, %%ecx    \n\t"
00656                   "cmpl      $0, %%ecx       \n\t"
00657                   "jz        end8            \n\t"
00658 // preload        "movl      mask, %%edx     \n\t"
00659                   "sall      $24, %%edx      \n\t" // make low byte, high byte
00660 
00661                 "secondloop8:                \n\t"
00662                   "sall      %%edx           \n\t" // move high bit to CF
00663                   "jnc       skip8           \n\t" // if CF = 0
00664                   "movb      (%%esi), %%al   \n\t"
00665                   "movb      %%al, (%%edi)   \n\t"
00666 
00667                 "skip8:                      \n\t"
00668                   "incl      %%esi           \n\t"
00669                   "incl      %%edi           \n\t"
00670                   "decl      %%ecx           \n\t"
00671                   "jnz       secondloop8     \n\t"
00672 
00673                 "end8:                       \n\t"
00674                   "EMMS                      \n\t"  // DONE
00675 
00676                   : "=a" (dummy_value_a),           // output regs (dummy)
00677                     "=d" (dummy_value_d),
00678                     "=c" (dummy_value_c),
00679                     "=S" (dummy_value_S),
00680                     "=D" (dummy_value_D)
00681 
00682                   : "3" (srcptr),      // esi       // input regs
00683                     "4" (dstptr),      // edi
00684                     "0" (diff),        // eax
00685 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
00686                     "2" (len),         // ecx
00687                     "1" (mask)         // edx
00688 
00689 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
00690                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
00691 #endif
00692                );
00693             }
00694             else /* mmx _not supported - Use modified C routine */
00695 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00696             {
00697                register png_uint_32 i;
00698                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
00699                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
00700                register int stride = png_pass_inc[png_ptr->pass];
00701                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
00702                register int rep_bytes = png_pass_width[png_ptr->pass];
00703                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
00704                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
00705                int diff = (int) (png_ptr->width & 7); /* amount lost */
00706                register png_uint_32 final_val = len;  /* GRR bugfix */
00707 
00708                srcptr = png_ptr->row_buf + 1 + initial_val;
00709                dstptr = row + initial_val;
00710 
00711                for (i = initial_val; i < final_val; i += stride)
00712                {
00713                   png_memcpy(dstptr, srcptr, rep_bytes);
00714                   srcptr += stride;
00715                   dstptr += stride;
00716                }
00717                if (diff)  /* number of leftover pixels:  3 for pngtest */
00718                {
00719                   final_val+=diff /* *BPP1 */ ;
00720                   for (; i < final_val; i += stride)
00721                   {
00722                      if (rep_bytes > (int)(final_val-i))
00723                         rep_bytes = (int)(final_val-i);
00724                      png_memcpy(dstptr, srcptr, rep_bytes);
00725                      srcptr += stride;
00726                      dstptr += stride;
00727                   }
00728                }
00729 
00730             } /* end of else (_mmx_supported) */
00731 
00732             break;
00733          }       /* end 8 bpp */
00734 
00735          case 16:       /* png_ptr->row_info.pixel_depth */
00736          {
00737             png_bytep srcptr;
00738             png_bytep dstptr;
00739 
00740 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00741 #if !defined(PNG_1_0_X)
00742             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00743                 /* && _mmx_supported */ )
00744 #else
00745             if (_mmx_supported)
00746 #endif
00747             {
00748                png_uint_32 len;
00749                int diff;
00750                int dummy_value_a;   // fix 'forbidden register spilled' error
00751                int dummy_value_d;
00752                int dummy_value_c;
00753                int dummy_value_S;
00754                int dummy_value_D;
00755                _unmask = ~mask;            // global variable for -fPIC version
00756                srcptr = png_ptr->row_buf + 1;
00757                dstptr = row;
00758                len  = png_ptr->width &~7;  // reduce to multiple of 8
00759                diff = (int) (png_ptr->width & 7); // amount lost //
00760 
00761                __asm__ __volatile__ (
00762                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
00763                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
00764                   "punpcklbw %%mm7, %%mm7     \n\t"
00765                   "punpcklwd %%mm7, %%mm7     \n\t"
00766                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
00767 
00768                   "movq      _mask16_0, %%mm0 \n\t"
00769                   "movq      _mask16_1, %%mm1 \n\t"
00770 
00771                   "pand      %%mm7, %%mm0     \n\t"
00772                   "pand      %%mm7, %%mm1     \n\t"
00773 
00774                   "pcmpeqb   %%mm6, %%mm0     \n\t"
00775                   "pcmpeqb   %%mm6, %%mm1     \n\t"
00776 
00777 // preload        "movl      len, %%ecx       \n\t" // load length of line
00778 // preload        "movl      srcptr, %%esi    \n\t" // load source
00779 // preload        "movl      dstptr, %%edi    \n\t" // load dest
00780 
00781                   "cmpl      $0, %%ecx        \n\t"
00782                   "jz        mainloop16end    \n\t"
00783 
00784                 "mainloop16:                  \n\t"
00785                   "movq      (%%esi), %%mm4   \n\t"
00786                   "pand      %%mm0, %%mm4     \n\t"
00787                   "movq      %%mm0, %%mm6     \n\t"
00788                   "movq      (%%edi), %%mm7   \n\t"
00789                   "pandn     %%mm7, %%mm6     \n\t"
00790                   "por       %%mm6, %%mm4     \n\t"
00791                   "movq      %%mm4, (%%edi)   \n\t"
00792 
00793                   "movq      8(%%esi), %%mm5  \n\t"
00794                   "pand      %%mm1, %%mm5     \n\t"
00795                   "movq      %%mm1, %%mm7     \n\t"
00796                   "movq      8(%%edi), %%mm6  \n\t"
00797                   "pandn     %%mm6, %%mm7     \n\t"
00798                   "por       %%mm7, %%mm5     \n\t"
00799                   "movq      %%mm5, 8(%%edi)  \n\t"
00800 
00801                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
00802                   "addl      $16, %%edi       \n\t"
00803                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
00804                   "ja        mainloop16       \n\t"
00805 
00806                 "mainloop16end:               \n\t"
00807 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
00808                   "movl      %%eax, %%ecx     \n\t"
00809                   "cmpl      $0, %%ecx        \n\t"
00810                   "jz        end16            \n\t"
00811 // preload        "movl      mask, %%edx      \n\t"
00812                   "sall      $24, %%edx       \n\t" // make low byte, high byte
00813 
00814                 "secondloop16:                \n\t"
00815                   "sall      %%edx            \n\t" // move high bit to CF
00816                   "jnc       skip16           \n\t" // if CF = 0
00817                   "movw      (%%esi), %%ax    \n\t"
00818                   "movw      %%ax, (%%edi)    \n\t"
00819 
00820                 "skip16:                      \n\t"
00821                   "addl      $2, %%esi        \n\t"
00822                   "addl      $2, %%edi        \n\t"
00823                   "decl      %%ecx            \n\t"
00824                   "jnz       secondloop16     \n\t"
00825 
00826                 "end16:                       \n\t"
00827                   "EMMS                       \n\t" // DONE
00828 
00829                   : "=a" (dummy_value_a),           // output regs (dummy)
00830                     "=c" (dummy_value_c),
00831                     "=d" (dummy_value_d),
00832                     "=S" (dummy_value_S),
00833                     "=D" (dummy_value_D)
00834 
00835                   : "0" (diff),        // eax       // input regs
00836 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
00837                     "1" (len),         // ecx
00838                     "2" (mask),        // edx
00839                     "3" (srcptr),      // esi
00840                     "4" (dstptr)       // edi
00841 
00842 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
00843                   : "%mm0", "%mm1", "%mm4"          // clobber list
00844                   , "%mm5", "%mm6", "%mm7"
00845 #endif
00846                );
00847             }
00848             else /* mmx _not supported - Use modified C routine */
00849 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
00850             {
00851                register png_uint_32 i;
00852                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
00853                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
00854                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
00855                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
00856                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
00857                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
00858                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
00859                int diff = (int) (png_ptr->width & 7); /* amount lost */
00860                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
00861 
00862                srcptr = png_ptr->row_buf + 1 + initial_val;
00863                dstptr = row + initial_val;
00864 
00865                for (i = initial_val; i < final_val; i += stride)
00866                {
00867                   png_memcpy(dstptr, srcptr, rep_bytes);
00868                   srcptr += stride;
00869                   dstptr += stride;
00870                }
00871                if (diff)  /* number of leftover pixels:  3 for pngtest */
00872                {
00873                   final_val+=diff*BPP2;
00874                   for (; i < final_val; i += stride)
00875                   {
00876                      if (rep_bytes > (int)(final_val-i))
00877                         rep_bytes = (int)(final_val-i);
00878                      png_memcpy(dstptr, srcptr, rep_bytes);
00879                      srcptr += stride;
00880                      dstptr += stride;
00881                   }
00882                }
00883             } /* end of else (_mmx_supported) */
00884 
00885             break;
00886          }       /* end 16 bpp */
00887 
00888          case 24:       /* png_ptr->row_info.pixel_depth */
00889          {
00890             png_bytep srcptr;
00891             png_bytep dstptr;
00892 
00893 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
00894 #if !defined(PNG_1_0_X)
00895             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00896                 /* && _mmx_supported */ )
00897 #else
00898             if (_mmx_supported)
00899 #endif
00900             {
00901                png_uint_32 len;
00902                int diff;
00903                int dummy_value_a;   // fix 'forbidden register spilled' error
00904                int dummy_value_d;
00905                int dummy_value_c;
00906                int dummy_value_S;
00907                int dummy_value_D;
00908                _unmask = ~mask;            // global variable for -fPIC version
00909                srcptr = png_ptr->row_buf + 1;
00910                dstptr = row;
00911                len  = png_ptr->width &~7;  // reduce to multiple of 8
00912                diff = (int) (png_ptr->width & 7); // amount lost //
00913 
00914                __asm__ __volatile__ (
00915                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
00916                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
00917                   "punpcklbw %%mm7, %%mm7     \n\t"
00918                   "punpcklwd %%mm7, %%mm7     \n\t"
00919                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
00920 
00921                   "movq      _mask24_0, %%mm0 \n\t"
00922                   "movq      _mask24_1, %%mm1 \n\t"
00923                   "movq      _mask24_2, %%mm2 \n\t"
00924 
00925                   "pand      %%mm7, %%mm0     \n\t"
00926                   "pand      %%mm7, %%mm1     \n\t"
00927                   "pand      %%mm7, %%mm2     \n\t"
00928 
00929                   "pcmpeqb   %%mm6, %%mm0     \n\t"
00930                   "pcmpeqb   %%mm6, %%mm1     \n\t"
00931                   "pcmpeqb   %%mm6, %%mm2     \n\t"
00932 
00933 // preload        "movl      len, %%ecx       \n\t" // load length of line
00934 // preload        "movl      srcptr, %%esi    \n\t" // load source
00935 // preload        "movl      dstptr, %%edi    \n\t" // load dest
00936 
00937                   "cmpl      $0, %%ecx        \n\t"
00938                   "jz        mainloop24end    \n\t"
00939 
00940                 "mainloop24:                  \n\t"
00941                   "movq      (%%esi), %%mm4   \n\t"
00942                   "pand      %%mm0, %%mm4     \n\t"
00943                   "movq      %%mm0, %%mm6     \n\t"
00944                   "movq      (%%edi), %%mm7   \n\t"
00945                   "pandn     %%mm7, %%mm6     \n\t"
00946                   "por       %%mm6, %%mm4     \n\t"
00947                   "movq      %%mm4, (%%edi)   \n\t"
00948 
00949                   "movq      8(%%esi), %%mm5  \n\t"
00950                   "pand      %%mm1, %%mm5     \n\t"
00951                   "movq      %%mm1, %%mm7     \n\t"
00952                   "movq      8(%%edi), %%mm6  \n\t"
00953                   "pandn     %%mm6, %%mm7     \n\t"
00954                   "por       %%mm7, %%mm5     \n\t"
00955                   "movq      %%mm5, 8(%%edi)  \n\t"
00956 
00957                   "movq      16(%%esi), %%mm6 \n\t"
00958                   "pand      %%mm2, %%mm6     \n\t"
00959                   "movq      %%mm2, %%mm4     \n\t"
00960                   "movq      16(%%edi), %%mm7 \n\t"
00961                   "pandn     %%mm7, %%mm4     \n\t"
00962                   "por       %%mm4, %%mm6     \n\t"
00963                   "movq      %%mm6, 16(%%edi) \n\t"
00964 
00965                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
00966                   "addl      $24, %%edi       \n\t"
00967                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
00968 
00969                   "ja        mainloop24       \n\t"
00970 
00971                 "mainloop24end:               \n\t"
00972 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
00973                   "movl      %%eax, %%ecx     \n\t"
00974                   "cmpl      $0, %%ecx        \n\t"
00975                   "jz        end24            \n\t"
00976 // preload        "movl      mask, %%edx      \n\t"
00977                   "sall      $24, %%edx       \n\t" // make low byte, high byte
00978 
00979                 "secondloop24:                \n\t"
00980                   "sall      %%edx            \n\t" // move high bit to CF
00981                   "jnc       skip24           \n\t" // if CF = 0
00982                   "movw      (%%esi), %%ax    \n\t"
00983                   "movw      %%ax, (%%edi)    \n\t"
00984                   "xorl      %%eax, %%eax     \n\t"
00985                   "movb      2(%%esi), %%al   \n\t"
00986                   "movb      %%al, 2(%%edi)   \n\t"
00987 
00988                 "skip24:                      \n\t"
00989                   "addl      $3, %%esi        \n\t"
00990                   "addl      $3, %%edi        \n\t"
00991                   "decl      %%ecx            \n\t"
00992                   "jnz       secondloop24     \n\t"
00993 
00994                 "end24:                       \n\t"
00995                   "EMMS                       \n\t" // DONE
00996 
00997                   : "=a" (dummy_value_a),           // output regs (dummy)
00998                     "=d" (dummy_value_d),
00999                     "=c" (dummy_value_c),
01000                     "=S" (dummy_value_S),
01001                     "=D" (dummy_value_D)
01002 
01003                   : "3" (srcptr),      // esi       // input regs
01004                     "4" (dstptr),      // edi
01005                     "0" (diff),        // eax
01006 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01007                     "2" (len),         // ecx
01008                     "1" (mask)         // edx
01009 
01010 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01011                   : "%mm0", "%mm1", "%mm2"          // clobber list
01012                   , "%mm4", "%mm5", "%mm6", "%mm7"
01013 #endif
01014                );
01015             }
01016             else /* mmx _not supported - Use modified C routine */
01017 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01018             {
01019                register png_uint_32 i;
01020                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
01021                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01022                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
01023                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01024                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
01025                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01026                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01027                int diff = (int) (png_ptr->width & 7); /* amount lost */
01028                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
01029 
01030                srcptr = png_ptr->row_buf + 1 + initial_val;
01031                dstptr = row + initial_val;
01032 
01033                for (i = initial_val; i < final_val; i += stride)
01034                {
01035                   png_memcpy(dstptr, srcptr, rep_bytes);
01036                   srcptr += stride;
01037                   dstptr += stride;
01038                }
01039                if (diff)  /* number of leftover pixels:  3 for pngtest */
01040                {
01041                   final_val+=diff*BPP3;
01042                   for (; i < final_val; i += stride)
01043                   {
01044                      if (rep_bytes > (int)(final_val-i))
01045                         rep_bytes = (int)(final_val-i);
01046                      png_memcpy(dstptr, srcptr, rep_bytes);
01047                      srcptr += stride;
01048                      dstptr += stride;
01049                   }
01050                }
01051             } /* end of else (_mmx_supported) */
01052 
01053             break;
01054          }       /* end 24 bpp */
01055 
01056          case 32:       /* png_ptr->row_info.pixel_depth */
01057          {
01058             png_bytep srcptr;
01059             png_bytep dstptr;
01060 
01061 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
01062 #if !defined(PNG_1_0_X)
01063             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
01064                 /* && _mmx_supported */ )
01065 #else
01066             if (_mmx_supported)
01067 #endif
01068             {
01069                png_uint_32 len;
01070                int diff;
01071                int dummy_value_a;   // fix 'forbidden register spilled' error
01072                int dummy_value_d;
01073                int dummy_value_c;
01074                int dummy_value_S;
01075                int dummy_value_D;
01076                _unmask = ~mask;            // global variable for -fPIC version
01077                srcptr = png_ptr->row_buf + 1;
01078                dstptr = row;
01079                len  = png_ptr->width &~7;  // reduce to multiple of 8
01080                diff = (int) (png_ptr->width & 7); // amount lost //
01081 
01082                __asm__ __volatile__ (
01083                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
01084                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
01085                   "punpcklbw %%mm7, %%mm7     \n\t"
01086                   "punpcklwd %%mm7, %%mm7     \n\t"
01087                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
01088 
01089                   "movq      _mask32_0, %%mm0 \n\t"
01090                   "movq      _mask32_1, %%mm1 \n\t"
01091                   "movq      _mask32_2, %%mm2 \n\t"
01092                   "movq      _mask32_3, %%mm3 \n\t"
01093 
01094                   "pand      %%mm7, %%mm0     \n\t"
01095                   "pand      %%mm7, %%mm1     \n\t"
01096                   "pand      %%mm7, %%mm2     \n\t"
01097                   "pand      %%mm7, %%mm3     \n\t"
01098 
01099                   "pcmpeqb   %%mm6, %%mm0     \n\t"
01100                   "pcmpeqb   %%mm6, %%mm1     \n\t"
01101                   "pcmpeqb   %%mm6, %%mm2     \n\t"
01102                   "pcmpeqb   %%mm6, %%mm3     \n\t"
01103 
01104 // preload        "movl      len, %%ecx       \n\t" // load length of line
01105 // preload        "movl      srcptr, %%esi    \n\t" // load source
01106 // preload        "movl      dstptr, %%edi    \n\t" // load dest
01107 
01108                   "cmpl      $0, %%ecx        \n\t" // lcr
01109                   "jz        mainloop32end    \n\t"
01110 
01111                 "mainloop32:                  \n\t"
01112                   "movq      (%%esi), %%mm4   \n\t"
01113                   "pand      %%mm0, %%mm4     \n\t"
01114                   "movq      %%mm0, %%mm6     \n\t"
01115                   "movq      (%%edi), %%mm7   \n\t"
01116                   "pandn     %%mm7, %%mm6     \n\t"
01117                   "por       %%mm6, %%mm4     \n\t"
01118                   "movq      %%mm4, (%%edi)   \n\t"
01119 
01120                   "movq      8(%%esi), %%mm5  \n\t"
01121                   "pand      %%mm1, %%mm5     \n\t"
01122                   "movq      %%mm1, %%mm7     \n\t"
01123                   "movq      8(%%edi), %%mm6  \n\t"
01124                   "pandn     %%mm6, %%mm7     \n\t"
01125                   "por       %%mm7, %%mm5     \n\t"
01126                   "movq      %%mm5, 8(%%edi)  \n\t"
01127 
01128                   "movq      16(%%esi), %%mm6 \n\t"
01129                   "pand      %%mm2, %%mm6     \n\t"
01130                   "movq      %%mm2, %%mm4     \n\t"
01131                   "movq      16(%%edi), %%mm7 \n\t"
01132                   "pandn     %%mm7, %%mm4     \n\t"
01133                   "por       %%mm4, %%mm6     \n\t"
01134                   "movq      %%mm6, 16(%%edi) \n\t"
01135 
01136                   "movq      24(%%esi), %%mm7 \n\t"
01137                   "pand      %%mm3, %%mm7     \n\t"
01138                   "movq      %%mm3, %%mm5     \n\t"
01139                   "movq      24(%%edi), %%mm4 \n\t"
01140                   "pandn     %%mm4, %%mm5     \n\t"
01141                   "por       %%mm5, %%mm7     \n\t"
01142                   "movq      %%mm7, 24(%%edi) \n\t"
01143 
01144                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
01145                   "addl      $32, %%edi       \n\t"
01146                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
01147                   "ja        mainloop32       \n\t"
01148 
01149                 "mainloop32end:               \n\t"
01150 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
01151                   "movl      %%eax, %%ecx     \n\t"
01152                   "cmpl      $0, %%ecx        \n\t"
01153                   "jz        end32            \n\t"
01154 // preload        "movl      mask, %%edx      \n\t"
01155                   "sall      $24, %%edx       \n\t" // low byte => high byte
01156 
01157                 "secondloop32:                \n\t"
01158                   "sall      %%edx            \n\t" // move high bit to CF
01159                   "jnc       skip32           \n\t" // if CF = 0
01160                   "movl      (%%esi), %%eax   \n\t"
01161                   "movl      %%eax, (%%edi)   \n\t"
01162 
01163                 "skip32:                      \n\t"
01164                   "addl      $4, %%esi        \n\t"
01165                   "addl      $4, %%edi        \n\t"
01166                   "decl      %%ecx            \n\t"
01167                   "jnz       secondloop32     \n\t"
01168 
01169                 "end32:                       \n\t"
01170                   "EMMS                       \n\t" // DONE
01171 
01172                   : "=a" (dummy_value_a),           // output regs (dummy)
01173                     "=d" (dummy_value_d),
01174                     "=c" (dummy_value_c),
01175                     "=S" (dummy_value_S),
01176                     "=D" (dummy_value_D)
01177 
01178                   : "3" (srcptr),      // esi       // input regs
01179                     "4" (dstptr),      // edi
01180                     "0" (diff),        // eax
01181 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01182                     "2" (len),         // ecx
01183                     "1" (mask)         // edx
01184 
01185 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01186                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
01187                   , "%mm4", "%mm5", "%mm6", "%mm7"
01188 #endif
01189                );
01190             }
01191             else /* mmx _not supported - Use modified C routine */
01192 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01193             {
01194                register png_uint_32 i;
01195                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
01196                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01197                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
01198                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01199                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
01200                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01201                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01202                int diff = (int) (png_ptr->width & 7); /* amount lost */
01203                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
01204 
01205                srcptr = png_ptr->row_buf + 1 + initial_val;
01206                dstptr = row + initial_val;
01207 
01208                for (i = initial_val; i < final_val; i += stride)
01209                {
01210                   png_memcpy(dstptr, srcptr, rep_bytes);
01211                   srcptr += stride;
01212                   dstptr += stride;
01213                }
01214                if (diff)  /* number of leftover pixels:  3 for pngtest */
01215                {
01216                   final_val+=diff*BPP4;
01217                   for (; i < final_val; i += stride)
01218                   {
01219                      if (rep_bytes > (int)(final_val-i))
01220                         rep_bytes = (int)(final_val-i);
01221                      png_memcpy(dstptr, srcptr, rep_bytes);
01222                      srcptr += stride;
01223                      dstptr += stride;
01224                   }
01225                }
01226             } /* end of else (_mmx_supported) */
01227 
01228             break;
01229          }       /* end 32 bpp */
01230 
01231          case 48:       /* png_ptr->row_info.pixel_depth */
01232          {
01233             png_bytep srcptr;
01234             png_bytep dstptr;
01235 
01236 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
01237 #if !defined(PNG_1_0_X)
01238             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
01239                 /* && _mmx_supported */ )
01240 #else
01241             if (_mmx_supported)
01242 #endif
01243             {
01244                png_uint_32 len;
01245                int diff;
01246                int dummy_value_a;   // fix 'forbidden register spilled' error
01247                int dummy_value_d;
01248                int dummy_value_c;
01249                int dummy_value_S;
01250                int dummy_value_D;
01251                _unmask = ~mask;            // global variable for -fPIC version
01252                srcptr = png_ptr->row_buf + 1;
01253                dstptr = row;
01254                len  = png_ptr->width &~7;  // reduce to multiple of 8
01255                diff = (int) (png_ptr->width & 7); // amount lost //
01256 
01257                __asm__ __volatile__ (
01258                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
01259                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
01260                   "punpcklbw %%mm7, %%mm7     \n\t"
01261                   "punpcklwd %%mm7, %%mm7     \n\t"
01262                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
01263 
01264                   "movq      _mask48_0, %%mm0 \n\t"
01265                   "movq      _mask48_1, %%mm1 \n\t"
01266                   "movq      _mask48_2, %%mm2 \n\t"
01267                   "movq      _mask48_3, %%mm3 \n\t"
01268                   "movq      _mask48_4, %%mm4 \n\t"
01269                   "movq      _mask48_5, %%mm5 \n\t"
01270 
01271                   "pand      %%mm7, %%mm0     \n\t"
01272                   "pand      %%mm7, %%mm1     \n\t"
01273                   "pand      %%mm7, %%mm2     \n\t"
01274                   "pand      %%mm7, %%mm3     \n\t"
01275                   "pand      %%mm7, %%mm4     \n\t"
01276                   "pand      %%mm7, %%mm5     \n\t"
01277 
01278                   "pcmpeqb   %%mm6, %%mm0     \n\t"
01279                   "pcmpeqb   %%mm6, %%mm1     \n\t"
01280                   "pcmpeqb   %%mm6, %%mm2     \n\t"
01281                   "pcmpeqb   %%mm6, %%mm3     \n\t"
01282                   "pcmpeqb   %%mm6, %%mm4     \n\t"
01283                   "pcmpeqb   %%mm6, %%mm5     \n\t"
01284 
01285 // preload        "movl      len, %%ecx       \n\t" // load length of line
01286 // preload        "movl      srcptr, %%esi    \n\t" // load source
01287 // preload        "movl      dstptr, %%edi    \n\t" // load dest
01288 
01289                   "cmpl      $0, %%ecx        \n\t"
01290                   "jz        mainloop48end    \n\t"
01291 
01292                 "mainloop48:                  \n\t"
01293                   "movq      (%%esi), %%mm7   \n\t"
01294                   "pand      %%mm0, %%mm7     \n\t"
01295                   "movq      %%mm0, %%mm6     \n\t"
01296                   "pandn     (%%edi), %%mm6   \n\t"
01297                   "por       %%mm6, %%mm7     \n\t"
01298                   "movq      %%mm7, (%%edi)   \n\t"
01299 
01300                   "movq      8(%%esi), %%mm6  \n\t"
01301                   "pand      %%mm1, %%mm6     \n\t"
01302                   "movq      %%mm1, %%mm7     \n\t"
01303                   "pandn     8(%%edi), %%mm7  \n\t"
01304                   "por       %%mm7, %%mm6     \n\t"
01305                   "movq      %%mm6, 8(%%edi)  \n\t"
01306 
01307                   "movq      16(%%esi), %%mm6 \n\t"
01308                   "pand      %%mm2, %%mm6     \n\t"
01309                   "movq      %%mm2, %%mm7     \n\t"
01310                   "pandn     16(%%edi), %%mm7 \n\t"
01311                   "por       %%mm7, %%mm6     \n\t"
01312                   "movq      %%mm6, 16(%%edi) \n\t"
01313 
01314                   "movq      24(%%esi), %%mm7 \n\t"
01315                   "pand      %%mm3, %%mm7     \n\t"
01316                   "movq      %%mm3, %%mm6     \n\t"
01317                   "pandn     24(%%edi), %%mm6 \n\t"
01318                   "por       %%mm6, %%mm7     \n\t"
01319                   "movq      %%mm7, 24(%%edi) \n\t"
01320 
01321                   "movq      32(%%esi), %%mm6 \n\t"
01322                   "pand      %%mm4, %%mm6     \n\t"
01323                   "movq      %%mm4, %%mm7     \n\t"
01324                   "pandn     32(%%edi), %%mm7 \n\t"
01325                   "por       %%mm7, %%mm6     \n\t"
01326                   "movq      %%mm6, 32(%%edi) \n\t"
01327 
01328                   "movq      40(%%esi), %%mm7 \n\t"
01329                   "pand      %%mm5, %%mm7     \n\t"
01330                   "movq      %%mm5, %%mm6     \n\t"
01331                   "pandn     40(%%edi), %%mm6 \n\t"
01332                   "por       %%mm6, %%mm7     \n\t"
01333                   "movq      %%mm7, 40(%%edi) \n\t"
01334 
01335                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
01336                   "addl      $48, %%edi       \n\t"
01337                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
01338 
01339                   "ja        mainloop48       \n\t"
01340 
01341                 "mainloop48end:               \n\t"
01342 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
01343                   "movl      %%eax, %%ecx     \n\t"
01344                   "cmpl      $0, %%ecx        \n\t"
01345                   "jz        end48            \n\t"
01346 // preload        "movl      mask, %%edx      \n\t"
01347                   "sall      $24, %%edx       \n\t" // make low byte, high byte
01348 
01349                 "secondloop48:                \n\t" // one pass per leftover pixel
01350                   "sall      %%edx            \n\t" // move high bit to CF
01351                   "jnc       skip48           \n\t" // if CF = 0, keep dest pixel
01352                   "movl      (%%esi), %%eax   \n\t" // copy bytes 0-3 of pixel
01353                   "movl      %%eax, (%%edi)   \n\t"
                        "movw      4(%%esi), %%ax   \n\t" // copy bytes 4-5 of pixel
                        "movw      %%ax, 4(%%edi)   \n\t"
01354 
01355                 "skip48:                      \n\t"
01356                   "addl      $6, %%esi        \n\t" // 48 bpp = 6 bytes per pixel
01357                   "addl      $6, %%edi        \n\t"
01358                   "decl      %%ecx            \n\t"
01359                   "jnz       secondloop48     \n\t"
01360 
01361                 "end48:                       \n\t"
01362                   "EMMS                       \n\t" // DONE
01363 
01364                   : "=a" (dummy_value_a),           // output regs (dummy)
01365                     "=d" (dummy_value_d),
01366                     "=c" (dummy_value_c),
01367                     "=S" (dummy_value_S),
01368                     "=D" (dummy_value_D)
01369 
01370                   : "3" (srcptr),      // esi       // input regs
01371                     "4" (dstptr),      // edi
01372                     "0" (diff),        // eax
01373 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
01374                     "2" (len),         // ecx
01375                     "1" (mask)         // edx
01376 
01377 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
01378                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
01379                   , "%mm4", "%mm5", "%mm6", "%mm7"
01380 #endif
01381                );
01382             }
01383             else /* mmx _not supported - Use modified C routine */
01384 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
01385             {
01386                register png_uint_32 i;
01387                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
01388                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01389                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
01390                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01391                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
01392                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01393                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01394                int diff = (int) (png_ptr->width & 7); /* amount lost */
01395                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
01396 
01397                srcptr = png_ptr->row_buf + 1 + initial_val;
01398                dstptr = row + initial_val;
01399 
01400                for (i = initial_val; i < final_val; i += stride)
01401                {
01402                   png_memcpy(dstptr, srcptr, rep_bytes);
01403                   srcptr += stride;
01404                   dstptr += stride;
01405                }
01406                if (diff)  /* number of leftover pixels:  3 for pngtest */
01407                {
01408                   final_val+=diff*BPP6;
01409                   for (; i < final_val; i += stride)
01410                   {
01411                      if (rep_bytes > (int)(final_val-i))
01412                         rep_bytes = (int)(final_val-i);
01413                      png_memcpy(dstptr, srcptr, rep_bytes);
01414                      srcptr += stride;
01415                      dstptr += stride;
01416                   }
01417                }
01418             } /* end of else (_mmx_supported) */
01419 
01420             break;
01421          }       /* end 48 bpp */
01422 
01423          case 64:       /* png_ptr->row_info.pixel_depth */
01424          {
01425             png_bytep srcptr;
01426             png_bytep dstptr;
01427             register png_uint_32 i;
01428             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
01429               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
01430             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
01431               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
01432             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
01433               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
01434             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
01435             int diff = (int) (png_ptr->width & 7); /* amount lost */
01436             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
01437 
01438             srcptr = png_ptr->row_buf + 1 + initial_val;
01439             dstptr = row + initial_val;
01440 
01441             for (i = initial_val; i < final_val; i += stride)
01442             {
01443                png_memcpy(dstptr, srcptr, rep_bytes);
01444                srcptr += stride;
01445                dstptr += stride;
01446             }
01447             if (diff)  /* number of leftover pixels:  3 for pngtest */
01448             {
01449                final_val+=diff*BPP8;
01450                for (; i < final_val; i += stride)
01451                {
01452                   if (rep_bytes > (int)(final_val-i))
01453                      rep_bytes = (int)(final_val-i);
01454                   png_memcpy(dstptr, srcptr, rep_bytes);
01455                   srcptr += stride;
01456                   dstptr += stride;
01457                }
01458             }
01459 
01460             break;
01461          }       /* end 64 bpp */
01462 
01463          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
01464          {
01465             /* this should never happen */
01466             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
01467             break;
01468          }
01469       } /* end switch (png_ptr->row_info.pixel_depth) */
01470 
01471    } /* end if (non-trivial mask) */
01472 
01473 } /* end png_combine_row() */
01474 
01475 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
01476 
01477 
01478 
01479 
01480 /*===========================================================================*/
01481 /*                                                                           */
01482 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
01483 /*                                                                           */
01484 /*===========================================================================*/
01485 
01486 #if defined(PNG_READ_INTERLACING_SUPPORTED)
01487 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
01488 
01489 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
01490  * has taken place.  [GRR: what other steps come before and/or after?]
01491  */
01492 
01493 void /* PRIVATE */
01494 png_do_read_interlace(png_structp png_ptr)
01495 {
01496    png_row_infop row_info = &(png_ptr->row_info);
01497    png_bytep row = png_ptr->row_buf + 1;
01498    int pass = png_ptr->pass;
01499 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01500    png_uint_32 transformations = png_ptr->transformations;
01501 #endif
01502 
01503    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
01504 
01505 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
01506    if (_mmx_supported == 2) {
01507 #if !defined(PNG_1_0_X)
01508        /* this should have happened in png_init_mmx_flags() already */
01509        png_warning(png_ptr, "asm_flags may not have been initialized");
01510 #endif
01511        png_mmx_support();
01512    }
01513 #endif
01514 
01515    if (row != NULL && row_info != NULL)
01516    {
01517       png_uint_32 final_width;
01518 
01519       final_width = row_info->width * png_pass_inc[pass];
01520 
01521       switch (row_info->pixel_depth)
01522       {
01523          case 1:
01524          {
01525             png_bytep sp, dp;
01526             int sshift, dshift;
01527             int s_start, s_end, s_inc;
01528             png_byte v;
01529             png_uint_32 i;
01530             int j;
01531 
01532             sp = row + (png_size_t)((row_info->width - 1) >> 3);
01533             dp = row + (png_size_t)((final_width - 1) >> 3);
01534 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01535             if (transformations & PNG_PACKSWAP)
01536             {
01537                sshift = (int)((row_info->width + 7) & 7);
01538                dshift = (int)((final_width + 7) & 7);
01539                s_start = 7;
01540                s_end = 0;
01541                s_inc = -1;
01542             }
01543             else
01544 #endif
01545             {
01546                sshift = 7 - (int)((row_info->width + 7) & 7);
01547                dshift = 7 - (int)((final_width + 7) & 7);
01548                s_start = 0;
01549                s_end = 7;
01550                s_inc = 1;
01551             }
01552 
01553             for (i = row_info->width; i; i--)
01554             {
01555                v = (png_byte)((*sp >> sshift) & 0x1);
01556                for (j = 0; j < png_pass_inc[pass]; j++)
01557                {
01558                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
01559                   *dp |= (png_byte)(v << dshift);
01560                   if (dshift == s_end)
01561                   {
01562                      dshift = s_start;
01563                      dp--;
01564                   }
01565                   else
01566                      dshift += s_inc;
01567                }
01568                if (sshift == s_end)
01569                {
01570                   sshift = s_start;
01571                   sp--;
01572                }
01573                else
01574                   sshift += s_inc;
01575             }
01576             break;
01577          }
01578 
01579          case 2:
01580          {
01581             png_bytep sp, dp;
01582             int sshift, dshift;
01583             int s_start, s_end, s_inc;
01584             png_uint_32 i;
01585 
01586             sp = row + (png_size_t)((row_info->width - 1) >> 2);
01587             dp = row + (png_size_t)((final_width - 1) >> 2);
01588 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01589             if (transformations & PNG_PACKSWAP)
01590             {
01591                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
01592                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
01593                s_start = 6;
01594                s_end = 0;
01595                s_inc = -2;
01596             }
01597             else
01598 #endif
01599             {
01600                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
01601                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
01602                s_start = 0;
01603                s_end = 6;
01604                s_inc = 2;
01605             }
01606 
01607             for (i = row_info->width; i; i--)
01608             {
01609                png_byte v;
01610                int j;
01611 
01612                v = (png_byte)((*sp >> sshift) & 0x3);
01613                for (j = 0; j < png_pass_inc[pass]; j++)
01614                {
01615                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
01616                   *dp |= (png_byte)(v << dshift);
01617                   if (dshift == s_end)
01618                   {
01619                      dshift = s_start;
01620                      dp--;
01621                   }
01622                   else
01623                      dshift += s_inc;
01624                }
01625                if (sshift == s_end)
01626                {
01627                   sshift = s_start;
01628                   sp--;
01629                }
01630                else
01631                   sshift += s_inc;
01632             }
01633             break;
01634          }
01635 
01636          case 4:
01637          {
01638             png_bytep sp, dp;
01639             int sshift, dshift;
01640             int s_start, s_end, s_inc;
01641             png_uint_32 i;
01642 
01643             sp = row + (png_size_t)((row_info->width - 1) >> 1);
01644             dp = row + (png_size_t)((final_width - 1) >> 1);
01645 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01646             if (transformations & PNG_PACKSWAP)
01647             {
01648                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
01649                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
01650                s_start = 4;
01651                s_end = 0;
01652                s_inc = -4;
01653             }
01654             else
01655 #endif
01656             {
01657                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
01658                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
01659                s_start = 0;
01660                s_end = 4;
01661                s_inc = 4;
01662             }
01663 
01664             for (i = row_info->width; i; i--)
01665             {
01666                png_byte v;
01667                int j;
01668 
01669                v = (png_byte)((*sp >> sshift) & 0xf);
01670                for (j = 0; j < png_pass_inc[pass]; j++)
01671                {
01672                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
01673                   *dp |= (png_byte)(v << dshift);
01674                   if (dshift == s_end)
01675                   {
01676                      dshift = s_start;
01677                      dp--;
01678                   }
01679                   else
01680                      dshift += s_inc;
01681                }
01682                if (sshift == s_end)
01683                {
01684                   sshift = s_start;
01685                   sp--;
01686                }
01687                else
01688                   sshift += s_inc;
01689             }
01690             break;
01691          }
01692 
01693        /*====================================================================*/
01694 
01695          default: /* 8-bit or larger (this is where the routine is modified) */
01696          {
01697 #if 0
01698 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
01699 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
01700 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
01701 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
01702 #endif
01703             png_bytep sptr, dp;
01704             png_uint_32 i;
01705             png_size_t pixel_bytes;
01706             int width = (int)row_info->width;
01707 
01708             pixel_bytes = (row_info->pixel_depth >> 3);
01709 
01710             /* point sptr at the last pixel in the pre-expanded row: */
01711             sptr = row + (width - 1) * pixel_bytes;
01712 
01713             /* point dp at the last pixel position in the expanded row: */
01714             dp = row + (final_width - 1) * pixel_bytes;
01715 
01716             /* New code by Nirav Chhatrapati - Intel Corporation */
01717 
01718 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
01719 #if !defined(PNG_1_0_X)
01720             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
01721                 /* && _mmx_supported */ )
01722 #else
01723             if (_mmx_supported)
01724 #endif
01725             {
01726                //--------------------------------------------------------------
01727                if (pixel_bytes == 3)
01728                {
01729                   if (((pass == 0) || (pass == 1)) && width)
01730                   {
01731                      int dummy_value_c;   // fix 'forbidden register spilled'
01732                      int dummy_value_S;
01733                      int dummy_value_D;
01734 
01735                      __asm__ __volatile__ (
01736                         "subl $21, %%edi         \n\t"
01737                                      // (png_pass_inc[pass] - 1)*pixel_bytes
01738 
01739                      ".loop3_pass0:              \n\t"
01740                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
01741                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
01742                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
01743                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
01744                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
01745                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
01746                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
01747                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
01748                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
01749                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
01750                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
01751                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
01752                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
01753                         "movq %%mm4, 16(%%edi)   \n\t"
01754                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
01755                         "movq %%mm3, 8(%%edi)    \n\t"
01756                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
01757                         "subl $3, %%esi          \n\t"
01758                         "movq %%mm0, (%%edi)     \n\t"
01759                         "subl $24, %%edi         \n\t"
01760                         "decl %%ecx              \n\t"
01761                         "jnz .loop3_pass0        \n\t"
01762                         "EMMS                    \n\t" // DONE
01763 
01764                         : "=c" (dummy_value_c),        // output regs (dummy)
01765                           "=S" (dummy_value_S),
01766                           "=D" (dummy_value_D)
01767 
01768                         : "1" (sptr),      // esi      // input regs
01769                           "2" (dp),        // edi
01770                           "0" (width)      // ecx
01771 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
01772 
01773 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
01774                         : "%mm0", "%mm1", "%mm2"       // clobber list
01775                         , "%mm3", "%mm4"
01776 #endif
01777                      );
01778                   }
01779                   else if (((pass == 2) || (pass == 3)) && width)
01780                   {
01781                      int dummy_value_c;   // fix 'forbidden register spilled'
01782                      int dummy_value_S;
01783                      int dummy_value_D;
01784 
01785                      __asm__ __volatile__ (
01786                         "subl $9, %%edi          \n\t"
01787                                      // (png_pass_inc[pass] - 1)*pixel_bytes
01788 
01789                      ".loop3_pass2:              \n\t"
01790                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
01791                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
01792                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
01793                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
01794                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
01795                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
01796                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
01797                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
01798                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
01799                         "movq %%mm0, 4(%%edi)    \n\t"
01800                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
01801                         "subl $3, %%esi          \n\t"
01802                         "movd %%mm0, (%%edi)     \n\t"
01803                         "subl $12, %%edi         \n\t"
01804                         "decl %%ecx              \n\t"
01805                         "jnz .loop3_pass2        \n\t"
01806                         "EMMS                    \n\t" // DONE
01807 
01808                         : "=c" (dummy_value_c),        // output regs (dummy)
01809                           "=S" (dummy_value_S),
01810                           "=D" (dummy_value_D)
01811 
01812                         : "1" (sptr),      // esi      // input regs
01813                           "2" (dp),        // edi
01814                           "0" (width)      // ecx
01815 
01816 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
01817                         : "%mm0", "%mm1", "%mm2"       // clobber list
01818 #endif
01819                      );
01820                   }
01821                   else if (width) /* && ((pass == 4) || (pass == 5)) */
01822                   {
01823                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
01824                      if (width_mmx < 0)
01825                          width_mmx = 0;
01826                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
01827                      if (width_mmx)
01828                      {
01829                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
01830                         // sptr points at last pixel in pre-expanded row
01831                         // dp points at last pixel position in expanded row
01832                         int dummy_value_c;  // fix 'forbidden register spilled'
01833                         int dummy_value_S;
01834                         int dummy_value_D;
01835 
01836                         __asm__ __volatile__ (
01837                            "subl $3, %%esi          \n\t"
01838                            "subl $9, %%edi          \n\t"
01839                                         // (png_pass_inc[pass] + 1)*pixel_bytes
01840 
01841                         ".loop3_pass4:              \n\t"
01842                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
01843                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
01844                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
01845                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
01846                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
01847                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
01848                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
01849                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
01850                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
01851                            "movq %%mm0, (%%edi)     \n\t"
01852                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
01853                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
01854                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
01855                            "subl $6, %%esi          \n\t"
01856                            "movd %%mm2, 8(%%edi)    \n\t"
01857                            "subl $12, %%edi         \n\t"
01858                            "subl $2, %%ecx          \n\t"
01859                            "jnz .loop3_pass4        \n\t"
01860                            "EMMS                    \n\t" // DONE
01861 
01862                            : "=c" (dummy_value_c),        // output regs (dummy)
01863                              "=S" (dummy_value_S),
01864                              "=D" (dummy_value_D)
01865 
01866                            : "1" (sptr),      // esi      // input regs
01867                              "2" (dp),        // edi
01868                              "0" (width_mmx)  // ecx
01869 
01870 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
01871                            : "%mm0", "%mm1"               // clobber list
01872                            , "%mm2", "%mm3"
01873 #endif
01874                         );
01875                      }
01876 
01877                      sptr -= width_mmx*3;
01878                      dp -= width_mmx*6;
01879                      for (i = width; i; i--)
01880                      {
01881                         png_byte v[8];
01882                         int j;
01883 
01884                         png_memcpy(v, sptr, 3);
01885                         for (j = 0; j < png_pass_inc[pass]; j++)
01886                         {
01887                            png_memcpy(dp, v, 3);
01888                            dp -= 3;
01889                         }
01890                         sptr -= 3;
01891                      }
01892                   }
01893                } /* end of pixel_bytes == 3 */
01894 
01895                //--------------------------------------------------------------
01896                else if (pixel_bytes == 1)
01897                {
01898                   if (((pass == 0) || (pass == 1)) && width)
01899                   {
01900                      int width_mmx = ((width >> 2) << 2);
01901                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
01902                      if (width_mmx)
01903                      {
01904                         int dummy_value_c;  // fix 'forbidden register spilled'
01905                         int dummy_value_S;
01906                         int dummy_value_D;
01907 
01908                         __asm__ __volatile__ (
01909                            "subl $3, %%esi          \n\t"
01910                            "subl $31, %%edi         \n\t"
01911 
01912                         ".loop1_pass0:              \n\t"
01913                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
01914                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
01915                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
01916                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
01917                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
01918                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
01919                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
01920                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
01921                            "movq %%mm0, (%%edi)     \n\t"
01922                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
01923                            "movq %%mm3, 8(%%edi)    \n\t"
01924                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
01925                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
01926                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
01927                            "movq %%mm2, 16(%%edi)   \n\t"
01928                            "subl $4, %%esi          \n\t"
01929                            "movq %%mm4, 24(%%edi)   \n\t"
01930                            "subl $32, %%edi         \n\t"
01931                            "subl $4, %%ecx          \n\t"
01932                            "jnz .loop1_pass0        \n\t"
01933                            "EMMS                    \n\t" // DONE
01934 
01935                            : "=c" (dummy_value_c),        // output regs (dummy)
01936                              "=S" (dummy_value_S),
01937                              "=D" (dummy_value_D)
01938 
01939                            : "1" (sptr),      // esi      // input regs
01940                              "2" (dp),        // edi
01941                              "0" (width_mmx)  // ecx
01942 
01943 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
01944                            : "%mm0", "%mm1", "%mm2"       // clobber list
01945                            , "%mm3", "%mm4"
01946 #endif
01947                         );
01948                      }
01949 
01950                      sptr -= width_mmx;
01951                      dp -= width_mmx*8;
01952                      for (i = width; i; i--)
01953                      {
01954                         int j;
01955 
01956                        /* I simplified this part in version 1.0.4e
01957                         * here and in several other instances where
01958                         * pixel_bytes == 1  -- GR-P
01959                         *
01960                         * Original code:
01961                         *
01962                         * png_byte v[8];
01963                         * png_memcpy(v, sptr, pixel_bytes);
01964                         * for (j = 0; j < png_pass_inc[pass]; j++)
01965                         * {
01966                         *    png_memcpy(dp, v, pixel_bytes);
01967                         *    dp -= pixel_bytes;
01968                         * }
01969                         * sptr -= pixel_bytes;
01970                         *
01971                         * Replacement code is in the next three lines:
01972                         */
01973 
01974                         for (j = 0; j < png_pass_inc[pass]; j++)
01975                         {
01976                            *dp-- = *sptr;
01977                         }
01978                         --sptr;
01979                      }
01980                   }
01981                   else if (((pass == 2) || (pass == 3)) && width)
01982                   {
01983                      int width_mmx = ((width >> 2) << 2);
01984                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
01985                      if (width_mmx)
01986                      {
01987                         int dummy_value_c;  // fix 'forbidden register spilled'
01988                         int dummy_value_S;
01989                         int dummy_value_D;
01990 
01991                         __asm__ __volatile__ (
01992                            "subl $3, %%esi          \n\t"
01993                            "subl $15, %%edi         \n\t"
01994 
01995                         ".loop1_pass2:              \n\t"
01996                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
01997                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
01998                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
01999                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
02000                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
02001                            "movq %%mm0, (%%edi)     \n\t"
02002                            "subl $4, %%esi          \n\t"
02003                            "movq %%mm1, 8(%%edi)    \n\t"
02004                            "subl $16, %%edi         \n\t"
02005                            "subl $4, %%ecx          \n\t"
02006                            "jnz .loop1_pass2        \n\t"
02007                            "EMMS                    \n\t" // DONE
02008 
02009                            : "=c" (dummy_value_c),        // output regs (dummy)
02010                              "=S" (dummy_value_S),
02011                              "=D" (dummy_value_D)
02012 
02013                            : "1" (sptr),      // esi      // input regs
02014                              "2" (dp),        // edi
02015                              "0" (width_mmx)  // ecx
02016 
02017 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02018                            : "%mm0", "%mm1"               // clobber list
02019 #endif
02020                         );
02021                      }
02022 
02023                      sptr -= width_mmx;
02024                      dp -= width_mmx*4;
02025                      for (i = width; i; i--)
02026                      {
02027                         int j;
02028 
02029                         for (j = 0; j < png_pass_inc[pass]; j++)
02030                         {
02031                            *dp-- = *sptr;
02032                         }
02033                         --sptr;
02034                      }
02035                   }
02036                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
02037                   {
02038                      int width_mmx = ((width >> 3) << 3);
02039                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
02040                      if (width_mmx)
02041                      {
02042                         int dummy_value_c;  // fix 'forbidden register spilled'
02043                         int dummy_value_S;
02044                         int dummy_value_D;
02045 
02046                         __asm__ __volatile__ (
02047                            "subl $7, %%esi          \n\t"
02048                            "subl $15, %%edi         \n\t"
02049 
02050                         ".loop1_pass4:              \n\t"
02051                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02052                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02053                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
02054                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
02055                            "movq %%mm1, 8(%%edi)    \n\t"
02056                            "subl $8, %%esi          \n\t"
02057                            "movq %%mm0, (%%edi)     \n\t"
02058                            "subl $16, %%edi         \n\t"
02059                            "subl $8, %%ecx          \n\t"
02060                            "jnz .loop1_pass4        \n\t"
02061                            "EMMS                    \n\t" // DONE
02062 
02063                            : "=c" (dummy_value_c),        // output regs (dummy)
02064                              "=S" (dummy_value_S),
02065                              "=D" (dummy_value_D)
02066 
02067                            : "1" (sptr),      // esi      // input regs
02068                              "2" (dp),        // edi
02069                              "0" (width_mmx)  // ecx
02070 
02071 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02072                            : "%mm0", "%mm1"               // clobber list
02073 #endif
02074                         );
02075                      }
02076 
02077                      sptr -= width_mmx;
02078                      dp -= width_mmx*2;
02079                      for (i = width; i; i--)
02080                      {
02081                         int j;
02082 
02083                         for (j = 0; j < png_pass_inc[pass]; j++)
02084                         {
02085                            *dp-- = *sptr;
02086                         }
02087                         --sptr;
02088                      }
02089                   }
02090                } /* end of pixel_bytes == 1 */
02091 
02092                //--------------------------------------------------------------
02093                else if (pixel_bytes == 2)
02094                {
02095                   if (((pass == 0) || (pass == 1)) && width)
02096                   {
02097                      int width_mmx = ((width >> 1) << 1);
02098                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02099                      if (width_mmx)
02100                      {
02101                         int dummy_value_c;  // fix 'forbidden register spilled'
02102                         int dummy_value_S;
02103                         int dummy_value_D;
02104 
02105                         __asm__ __volatile__ (
02106                            "subl $2, %%esi          \n\t"
02107                            "subl $30, %%edi         \n\t"
02108 
02109                         ".loop2_pass0:              \n\t"
02110                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02111                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02112                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
02113                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
02114                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
02115                            "movq %%mm0, (%%edi)     \n\t"
02116                            "movq %%mm0, 8(%%edi)    \n\t"
02117                            "movq %%mm1, 16(%%edi)   \n\t"
02118                            "subl $4, %%esi          \n\t"
02119                            "movq %%mm1, 24(%%edi)   \n\t"
02120                            "subl $32, %%edi         \n\t"
02121                            "subl $2, %%ecx          \n\t"
02122                            "jnz .loop2_pass0        \n\t"
02123                            "EMMS                    \n\t" // DONE
02124 
02125                            : "=c" (dummy_value_c),        // output regs (dummy)
02126                              "=S" (dummy_value_S),
02127                              "=D" (dummy_value_D)
02128 
02129                            : "1" (sptr),      // esi      // input regs
02130                              "2" (dp),        // edi
02131                              "0" (width_mmx)  // ecx
02132 
02133 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02134                            : "%mm0", "%mm1"               // clobber list
02135 #endif
02136                         );
02137                      }
02138 
02139                      sptr -= (width_mmx*2 - 2); // sign fixed
02140                      dp -= (width_mmx*16 - 2);  // sign fixed
02141                      for (i = width; i; i--)
02142                      {
02143                         png_byte v[8];
02144                         int j;
02145                         sptr -= 2;
02146                         png_memcpy(v, sptr, 2);
02147                         for (j = 0; j < png_pass_inc[pass]; j++)
02148                         {
02149                            dp -= 2;
02150                            png_memcpy(dp, v, 2);
02151                         }
02152                      }
02153                   }
02154                   else if (((pass == 2) || (pass == 3)) && width)
02155                   {
02156                      int width_mmx = ((width >> 1) << 1) ;
02157                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02158                      if (width_mmx)
02159                      {
02160                         int dummy_value_c;  // fix 'forbidden register spilled'
02161                         int dummy_value_S;
02162                         int dummy_value_D;
02163 
02164                         __asm__ __volatile__ (
02165                            "subl $2, %%esi          \n\t"
02166                            "subl $14, %%edi         \n\t"
02167 
02168                         ".loop2_pass2:              \n\t"
02169                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02170                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02171                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
02172                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
02173                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
02174                            "movq %%mm0, (%%edi)     \n\t"
02175                            "subl $4, %%esi          \n\t"
02176                            "movq %%mm1, 8(%%edi)    \n\t"
02177                            "subl $16, %%edi         \n\t"
02178                            "subl $2, %%ecx          \n\t"
02179                            "jnz .loop2_pass2        \n\t"
02180                            "EMMS                    \n\t" // DONE
02181 
02182                            : "=c" (dummy_value_c),        // output regs (dummy)
02183                              "=S" (dummy_value_S),
02184                              "=D" (dummy_value_D)
02185 
02186                            : "1" (sptr),      // esi      // input regs
02187                              "2" (dp),        // edi
02188                              "0" (width_mmx)  // ecx
02189 
02190 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02191                            : "%mm0", "%mm1"               // clobber list
02192 #endif
02193                         );
02194                      }
02195 
02196                      sptr -= (width_mmx*2 - 2); // sign fixed
02197                      dp -= (width_mmx*8 - 2);   // sign fixed
02198                      for (i = width; i; i--)
02199                      {
02200                         png_byte v[8];
02201                         int j;
02202                         sptr -= 2;
02203                         png_memcpy(v, sptr, 2);
02204                         for (j = 0; j < png_pass_inc[pass]; j++)
02205                         {
02206                            dp -= 2;
02207                            png_memcpy(dp, v, 2);
02208                         }
02209                      }
02210                   }
02211                   else if (width)  // pass == 4 or 5
02212                   {
02213                      int width_mmx = ((width >> 1) << 1) ;
02214                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
02215                      if (width_mmx)
02216                      {
02217                         int dummy_value_c;  // fix 'forbidden register spilled'
02218                         int dummy_value_S;
02219                         int dummy_value_D;
02220 
02221                         __asm__ __volatile__ (
02222                            "subl $2, %%esi          \n\t"
02223                            "subl $6, %%edi          \n\t"
02224 
02225                         ".loop2_pass4:              \n\t"
02226                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
02227                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
02228                            "subl $4, %%esi          \n\t"
02229                            "movq %%mm0, (%%edi)     \n\t"
02230                            "subl $8, %%edi          \n\t"
02231                            "subl $2, %%ecx          \n\t"
02232                            "jnz .loop2_pass4        \n\t"
02233                            "EMMS                    \n\t" // DONE
02234 
02235                            : "=c" (dummy_value_c),        // output regs (dummy)
02236                              "=S" (dummy_value_S),
02237                              "=D" (dummy_value_D)
02238 
02239                            : "1" (sptr),      // esi      // input regs
02240                              "2" (dp),        // edi
02241                              "0" (width_mmx)  // ecx
02242 
02243 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02244                            : "%mm0"                       // clobber list
02245 #endif
02246                         );
02247                      }
02248 
02249                      sptr -= (width_mmx*2 - 2); // sign fixed
02250                      dp -= (width_mmx*4 - 2);   // sign fixed
02251                      for (i = width; i; i--)
02252                      {
02253                         png_byte v[8];
02254                         int j;
02255                         sptr -= 2;
02256                         png_memcpy(v, sptr, 2);
02257                         for (j = 0; j < png_pass_inc[pass]; j++)
02258                         {
02259                            dp -= 2;
02260                            png_memcpy(dp, v, 2);
02261                         }
02262                      }
02263                   }
02264                } /* end of pixel_bytes == 2 */
02265 
02266                //--------------------------------------------------------------
02267                else if (pixel_bytes == 4)
02268                {
02269                   if (((pass == 0) || (pass == 1)) && width)
02270                   {
02271                      int width_mmx = ((width >> 1) << 1);
02272                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02273                      if (width_mmx)
02274                      {
02275                         int dummy_value_c;  // fix 'forbidden register spilled'
02276                         int dummy_value_S;
02277                         int dummy_value_D;
02278 
02279                         __asm__ __volatile__ (
02280                            "subl $4, %%esi          \n\t"
02281                            "subl $60, %%edi         \n\t"
02282 
02283                         ".loop4_pass0:              \n\t"
02284                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02285                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02286                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02287                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02288                            "movq %%mm0, (%%edi)     \n\t"
02289                            "movq %%mm0, 8(%%edi)    \n\t"
02290                            "movq %%mm0, 16(%%edi)   \n\t"
02291                            "movq %%mm0, 24(%%edi)   \n\t"
02292                            "movq %%mm1, 32(%%edi)   \n\t"
02293                            "movq %%mm1, 40(%%edi)   \n\t"
02294                            "movq %%mm1, 48(%%edi)   \n\t"
02295                            "subl $8, %%esi          \n\t"
02296                            "movq %%mm1, 56(%%edi)   \n\t"
02297                            "subl $64, %%edi         \n\t"
02298                            "subl $2, %%ecx          \n\t"
02299                            "jnz .loop4_pass0        \n\t"
02300                            "EMMS                    \n\t" // DONE
02301 
02302                            : "=c" (dummy_value_c),        // output regs (dummy)
02303                              "=S" (dummy_value_S),
02304                              "=D" (dummy_value_D)
02305 
02306                            : "1" (sptr),      // esi      // input regs
02307                              "2" (dp),        // edi
02308                              "0" (width_mmx)  // ecx
02309 
02310 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02311                            : "%mm0", "%mm1"               // clobber list
02312 #endif
02313                         );
02314                      }
02315 
02316                      sptr -= (width_mmx*4 - 4); // sign fixed
02317                      dp -= (width_mmx*32 - 4);  // sign fixed
02318                      for (i = width; i; i--)
02319                      {
02320                         png_byte v[8];
02321                         int j;
02322                         sptr -= 4;
02323                         png_memcpy(v, sptr, 4);
02324                         for (j = 0; j < png_pass_inc[pass]; j++)
02325                         {
02326                            dp -= 4;
02327                            png_memcpy(dp, v, 4);
02328                         }
02329                      }
02330                   }
02331                   else if (((pass == 2) || (pass == 3)) && width)
02332                   {
02333                      int width_mmx = ((width >> 1) << 1);
02334                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02335                      if (width_mmx)
02336                      {
02337                         int dummy_value_c;  // fix 'forbidden register spilled'
02338                         int dummy_value_S;
02339                         int dummy_value_D;
02340 
02341                         __asm__ __volatile__ (
02342                            "subl $4, %%esi          \n\t"
02343                            "subl $28, %%edi         \n\t"
02344 
02345                         ".loop4_pass2:              \n\t"
02346                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02347                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02348                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02349                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02350                            "movq %%mm0, (%%edi)     \n\t"
02351                            "movq %%mm0, 8(%%edi)    \n\t"
02352                            "movq %%mm1, 16(%%edi)   \n\t"
02353                            "movq %%mm1, 24(%%edi)   \n\t"
02354                            "subl $8, %%esi          \n\t"
02355                            "subl $32, %%edi         \n\t"
02356                            "subl $2, %%ecx          \n\t"
02357                            "jnz .loop4_pass2        \n\t"
02358                            "EMMS                    \n\t" // DONE
02359 
02360                            : "=c" (dummy_value_c),        // output regs (dummy)
02361                              "=S" (dummy_value_S),
02362                              "=D" (dummy_value_D)
02363 
02364                            : "1" (sptr),      // esi      // input regs
02365                              "2" (dp),        // edi
02366                              "0" (width_mmx)  // ecx
02367 
02368 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02369                            : "%mm0", "%mm1"               // clobber list
02370 #endif
02371                         );
02372                      }
02373 
02374                      sptr -= (width_mmx*4 - 4); // sign fixed
02375                      dp -= (width_mmx*16 - 4);  // sign fixed
02376                      for (i = width; i; i--)
02377                      {
02378                         png_byte v[8];
02379                         int j;
02380                         sptr -= 4;
02381                         png_memcpy(v, sptr, 4);
02382                         for (j = 0; j < png_pass_inc[pass]; j++)
02383                         {
02384                            dp -= 4;
02385                            png_memcpy(dp, v, 4);
02386                         }
02387                      }
02388                   }
02389                   else if (width)  // pass == 4 or 5
02390                   {
02391                      int width_mmx = ((width >> 1) << 1) ;
02392                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
02393                      if (width_mmx)
02394                      {
02395                         int dummy_value_c;  // fix 'forbidden register spilled'
02396                         int dummy_value_S;
02397                         int dummy_value_D;
02398 
02399                         __asm__ __volatile__ (
02400                            "subl $4, %%esi          \n\t"
02401                            "subl $12, %%edi         \n\t"
02402 
02403                         ".loop4_pass4:              \n\t"
02404                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02405                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
02406                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
02407                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
02408                            "movq %%mm0, (%%edi)     \n\t"
02409                            "subl $8, %%esi          \n\t"
02410                            "movq %%mm1, 8(%%edi)    \n\t"
02411                            "subl $16, %%edi         \n\t"
02412                            "subl $2, %%ecx          \n\t"
02413                            "jnz .loop4_pass4        \n\t"
02414                            "EMMS                    \n\t" // DONE
02415 
02416                            : "=c" (dummy_value_c),        // output regs (dummy)
02417                              "=S" (dummy_value_S),
02418                              "=D" (dummy_value_D)
02419 
02420                            : "1" (sptr),      // esi      // input regs
02421                              "2" (dp),        // edi
02422                              "0" (width_mmx)  // ecx
02423 
02424 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
02425                            : "%mm0", "%mm1"               // clobber list
02426 #endif
02427                         );
02428                      }
02429 
02430                      sptr -= (width_mmx*4 - 4); // sign fixed
02431                      dp -= (width_mmx*8 - 4);   // sign fixed
02432                      for (i = width; i; i--)
02433                      {
02434                         png_byte v[8];
02435                         int j;
02436                         sptr -= 4;
02437                         png_memcpy(v, sptr, 4);
02438                         for (j = 0; j < png_pass_inc[pass]; j++)
02439                         {
02440                            dp -= 4;
02441                            png_memcpy(dp, v, 4);
02442                         }
02443                      }
02444                   }
02445                } /* end of pixel_bytes == 4 */
02446 
02447                //--------------------------------------------------------------
02448                else if (pixel_bytes == 8)
02449                {
02450 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
02451                   // GRR NOTE:  no need to combine passes here!
02452                   if (((pass == 0) || (pass == 1)) && width)
02453                   {
02454                      int dummy_value_c;  // fix 'forbidden register spilled'
02455                      int dummy_value_S;
02456                      int dummy_value_D;
02457 
02458                      // source is 8-byte RRGGBBAA
02459                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
02460                      __asm__ __volatile__ (
02461                         "subl $56, %%edi         \n\t" // start of last block
02462 
02463                      ".loop8_pass0:              \n\t"
02464                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02465                         "movq %%mm0, (%%edi)     \n\t"
02466                         "movq %%mm0, 8(%%edi)    \n\t"
02467                         "movq %%mm0, 16(%%edi)   \n\t"
02468                         "movq %%mm0, 24(%%edi)   \n\t"
02469                         "movq %%mm0, 32(%%edi)   \n\t"
02470                         "movq %%mm0, 40(%%edi)   \n\t"
02471                         "movq %%mm0, 48(%%edi)   \n\t"
02472                         "subl $8, %%esi          \n\t"
02473                         "movq %%mm0, 56(%%edi)   \n\t"
02474                         "subl $64, %%edi         \n\t"
02475                         "decl %%ecx              \n\t"
02476                         "jnz .loop8_pass0        \n\t"
02477                         "EMMS                    \n\t" // DONE
02478 
02479                         : "=c" (dummy_value_c),        // output regs (dummy)
02480                           "=S" (dummy_value_S),
02481                           "=D" (dummy_value_D)
02482 
02483                         : "1" (sptr),      // esi      // input regs
02484                           "2" (dp),        // edi
02485                           "0" (width)      // ecx
02486 
02487 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02488                         : "%mm0"                       // clobber list
02489 #endif
02490                      );
02491                   }
02492                   else if (((pass == 2) || (pass == 3)) && width)
02493                   {
02494                      // source is 8-byte RRGGBBAA
02495                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
02496                      // (recall that expansion is _in place_:  sptr and dp
02497                      //  both point at locations within same row buffer)
02498                      {
02499                         int dummy_value_c;  // fix 'forbidden register spilled'
02500                         int dummy_value_S;
02501                         int dummy_value_D;
02502 
02503                         __asm__ __volatile__ (
02504                            "subl $24, %%edi         \n\t" // start of last block
02505 
02506                         ".loop8_pass2:              \n\t"
02507                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02508                            "movq %%mm0, (%%edi)     \n\t"
02509                            "movq %%mm0, 8(%%edi)    \n\t"
02510                            "movq %%mm0, 16(%%edi)   \n\t"
02511                            "subl $8, %%esi          \n\t"
02512                            "movq %%mm0, 24(%%edi)   \n\t"
02513                            "subl $32, %%edi         \n\t"
02514                            "decl %%ecx              \n\t"
02515                            "jnz .loop8_pass2        \n\t"
02516                            "EMMS                    \n\t" // DONE
02517 
02518                            : "=c" (dummy_value_c),        // output regs (dummy)
02519                              "=S" (dummy_value_S),
02520                              "=D" (dummy_value_D)
02521 
02522                            : "1" (sptr),      // esi      // input regs
02523                              "2" (dp),        // edi
02524                              "0" (width)      // ecx
02525 
02526 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02527                            : "%mm0"                       // clobber list
02528 #endif
02529                         );
02530                      }
02531                   }
02532                   else if (width)  // pass == 4 or 5
02533                   {
02534                      // source is 8-byte RRGGBBAA
02535                      // dest is 16-byte RRGGBBAA RRGGBBAA
02536                      {
02537                         int dummy_value_c;  // fix 'forbidden register spilled'
02538                         int dummy_value_S;
02539                         int dummy_value_D;
02540 
02541                         __asm__ __volatile__ (
02542                            "subl $8, %%edi          \n\t" // start of last block
02543 
02544                         ".loop8_pass4:              \n\t"
02545                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
02546                            "movq %%mm0, (%%edi)     \n\t"
02547                            "subl $8, %%esi          \n\t"
02548                            "movq %%mm0, 8(%%edi)    \n\t"
02549                            "subl $16, %%edi         \n\t"
02550                            "decl %%ecx              \n\t"
02551                            "jnz .loop8_pass4        \n\t"
02552                            "EMMS                    \n\t" // DONE
02553 
02554                            : "=c" (dummy_value_c),        // output regs (dummy)
02555                              "=S" (dummy_value_S),
02556                              "=D" (dummy_value_D)
02557 
02558                            : "1" (sptr),      // esi      // input regs
02559                              "2" (dp),        // edi
02560                              "0" (width)      // ecx
02561 
02562 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
02563                            : "%mm0"                       // clobber list
02564 #endif
02565                         );
02566                      }
02567                   }
02568 
02569                } /* end of pixel_bytes == 8 */
02570 
02571                //--------------------------------------------------------------
02572                else if (pixel_bytes == 6)
02573                {
02574                   for (i = width; i; i--)
02575                   {
02576                      png_byte v[8];
02577                      int j;
02578                      png_memcpy(v, sptr, 6);
02579                      for (j = 0; j < png_pass_inc[pass]; j++)
02580                      {
02581                         png_memcpy(dp, v, 6);
02582                         dp -= 6;
02583                      }
02584                      sptr -= 6;
02585                   }
02586                } /* end of pixel_bytes == 6 */
02587 
02588                //--------------------------------------------------------------
02589                else
02590                {
02591                   for (i = width; i; i--)
02592                   {
02593                      png_byte v[8];
02594                      int j;
02595                      png_memcpy(v, sptr, pixel_bytes);
02596                      for (j = 0; j < png_pass_inc[pass]; j++)
02597                      {
02598                         png_memcpy(dp, v, pixel_bytes);
02599                         dp -= pixel_bytes;
02600                      }
02601                      sptr-= pixel_bytes;
02602                   }
02603                }
02604             } // end of _mmx_supported ========================================
02605 
02606             else /* MMX not supported:  use modified C code - takes advantage
02607                   *   of inlining of png_memcpy for a constant */
02608                  /* GRR 19991007:  does it?  or should pixel_bytes in each
02609                   *   block be replaced with immediate value (e.g., 1)? */
02610                  /* GRR 19991017:  replaced with constants in each case */
02611 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
02612             {
02613                if (pixel_bytes == 1)
02614                {
02615                   for (i = width; i; i--)
02616                   {
02617                      int j;
02618                      for (j = 0; j < png_pass_inc[pass]; j++)
02619                      {
02620                         *dp-- = *sptr;
02621                      }
02622                      --sptr;
02623                   }
02624                }
02625                else if (pixel_bytes == 3)
02626                {
02627                   for (i = width; i; i--)
02628                   {
02629                      png_byte v[8];
02630                      int j;
02631                      png_memcpy(v, sptr, 3);
02632                      for (j = 0; j < png_pass_inc[pass]; j++)
02633                      {
02634                         png_memcpy(dp, v, 3);
02635                         dp -= 3;
02636                      }
02637                      sptr -= 3;
02638                   }
02639                }
02640                else if (pixel_bytes == 2)
02641                {
02642                   for (i = width; i; i--)
02643                   {
02644                      png_byte v[8];
02645                      int j;
02646                      png_memcpy(v, sptr, 2);
02647                      for (j = 0; j < png_pass_inc[pass]; j++)
02648                      {
02649                         png_memcpy(dp, v, 2);
02650                         dp -= 2;
02651                      }
02652                      sptr -= 2;
02653                   }
02654                }
02655                else if (pixel_bytes == 4)
02656                {
02657                   for (i = width; i; i--)
02658                   {
02659                      png_byte v[8];
02660                      int j;
02661                      png_memcpy(v, sptr, 4);
02662                      for (j = 0; j < png_pass_inc[pass]; j++)
02663                      {
02664 #ifdef PNG_DEBUG
02665                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
02666                         {
02667                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
02668                              row, dp, row+png_ptr->row_buf_size);
02669                            printf("row_buf=%d\n",png_ptr->row_buf_size);
02670                         }
02671 #endif
02672                         png_memcpy(dp, v, 4);
02673                         dp -= 4;
02674                      }
02675                      sptr -= 4;
02676                   }
02677                }
02678                else if (pixel_bytes == 6)
02679                {
02680                   for (i = width; i; i--)
02681                   {
02682                      png_byte v[8];
02683                      int j;
02684                      png_memcpy(v, sptr, 6);
02685                      for (j = 0; j < png_pass_inc[pass]; j++)
02686                      {
02687                         png_memcpy(dp, v, 6);
02688                         dp -= 6;
02689                      }
02690                      sptr -= 6;
02691                   }
02692                }
02693                else if (pixel_bytes == 8)
02694                {
02695                   for (i = width; i; i--)
02696                   {
02697                      png_byte v[8];
02698                      int j;
02699                      png_memcpy(v, sptr, 8);
02700                      for (j = 0; j < png_pass_inc[pass]; j++)
02701                      {
02702                         png_memcpy(dp, v, 8);
02703                         dp -= 8;
02704                      }
02705                      sptr -= 8;
02706                   }
02707                }
02708                else     /* GRR:  should never be reached */
02709                {
02710                   for (i = width; i; i--)
02711                   {
02712                      png_byte v[8];
02713                      int j;
02714                      png_memcpy(v, sptr, pixel_bytes);
02715                      for (j = 0; j < png_pass_inc[pass]; j++)
02716                      {
02717                         png_memcpy(dp, v, pixel_bytes);
02718                         dp -= pixel_bytes;
02719                      }
02720                      sptr -= pixel_bytes;
02721                   }
02722                }
02723 
02724             } /* end if (MMX not supported) */
02725             break;
02726          }
02727       } /* end switch (row_info->pixel_depth) */
02728 
02729       row_info->width = final_width;
02730       row_info->rowbytes = ((final_width *
02731          (png_uint_32)row_info->pixel_depth + 7) >> 3);
02732    }
02733 
02734 } /* end png_do_read_interlace() */
02735 
02736 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
02737 #endif /* PNG_READ_INTERLACING_SUPPORTED */
02738 
02739 
02740 
02741 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
02742 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
02743 
02744 // These variables are utilized in the functions below.  They are declared
02745 // globally here to ensure alignment on 8-byte boundaries.
02746 
02747 union uAll {  // 8-byte cell for MMX masks/shift counts; the inline asm names these objects directly (e.g. "movq _ActiveMask, %%mm7"), so do not rename them
02748    long long use;  // integer view; as the first member it is the one set by the brace initializers below
02749    double  align;  // never read in the code shown; presumably present only to force alignment -- NOTE(review): confirm 8-byte alignment on the target ABI
02750 } _LBCarryMask = {0x0101010101010101LL},  // 0x01 in every byte: isolates each byte's lsb so the carry of (a+b)/2 can be recovered
02751   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},  // 0x7f in every byte: clears bit 7 of each byte after a 64-bit "psrlq $1"
02752   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;  // no initializer (start as zero); _ActiveMask/_ShiftBpp/_ShiftRem are assigned per bpp before the asm reads them
02753 
02754 #ifdef PNG_THREAD_UNSAFE_OK
02755 //===========================================================================//
02756 //                                                                           //
02757 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
02758 //                                                                           //
02759 //===========================================================================//
02760 
02761 // Optimized code for PNG Average filter decoder
02762 
02763 static void /* PRIVATE */
02764 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
02765                             png_bytep prev_row)
02766 {
02767    int bpp;
02768    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
02769    int dummy_value_S;
02770    int dummy_value_D;
02771 
02772    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
02773    _FullLength  = row_info->rowbytes;       // # of bytes to filter
02774 
02775    __asm__ __volatile__ (
02776       // initialize address pointers and offset
02777 #ifdef __PIC__
02778       "pushl %%ebx                 \n\t" // save index to Global Offset Table
02779 #endif
02780 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
02781       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
02782       "movl %%edi, %%edx           \n\t"
02783 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
02784 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
02785       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
02786 
02787       "xorl %%eax,%%eax            \n\t"
02788 
02789       // Compute the Raw value for the first bpp bytes
02790       //    Raw(x) = Avg(x) + (Prior(x)/2)
02791    "avg_rlp:                       \n\t"
02792       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
02793       "incl %%ebx                  \n\t"
02794       "shrb %%al                   \n\t" // divide by 2
02795       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
02796 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
02797       "cmpl %%ecx, %%ebx           \n\t"
02798       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
02799       "jb avg_rlp                  \n\t" // mov does not affect flags
02800 
02801       // get # of bytes to alignment
02802       "movl %%edi, _dif            \n\t" // take start of row
02803       "addl %%ebx, _dif            \n\t" // add bpp
02804       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
02805       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
02806       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
02807       "jz avg_go                   \n\t" //  alignment
02808 
02809       // fix alignment
02810       // Compute the Raw value for the bytes up to the alignment boundary
02811       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
02812       "xorl %%ecx, %%ecx           \n\t"
02813 
02814    "avg_lp1:                       \n\t"
02815       "xorl %%eax, %%eax           \n\t"
02816       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
02817       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
02818       "addw %%cx, %%ax             \n\t"
02819       "incl %%ebx                  \n\t"
02820       "shrw %%ax                   \n\t" // divide by 2
02821       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
02822       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
02823       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
02824       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
02825 
02826    "avg_go:                        \n\t"
02827       "movl _FullLength, %%eax     \n\t"
02828       "movl %%eax, %%ecx           \n\t"
02829       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
02830       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
02831       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
02832       "movl %%ecx, _MMXLength      \n\t"
02833 #ifdef __PIC__
02834       "popl %%ebx                  \n\t" // restore index to Global Offset Table
02835 #endif
02836 
02837       : "=c" (dummy_value_c),            // output regs (dummy)
02838         "=S" (dummy_value_S),
02839         "=D" (dummy_value_D)
02840 
02841       : "0" (bpp),       // ecx          // input regs
02842         "1" (prev_row),  // esi
02843         "2" (row)        // edi
02844 
02845       : "%eax", "%edx"                   // clobber list
02846 #ifndef __PIC__
02847       , "%ebx"
02848 #endif
02849       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
02850       // (seems to work fine without...)
02851    );
02852 
02853    // now do the math for the rest of the row
02854    switch (bpp)
02855    {
02856       case 3:
02857       {
02858          _ActiveMask.use  = 0x0000000000ffffffLL;
02859          _ShiftBpp.use = 24;    // == 3 * 8
02860          _ShiftRem.use = 40;    // == 64 - 24
02861 
02862          __asm__ __volatile__ (
02863             // re-init address pointers and offset
02864             "movq _ActiveMask, %%mm7      \n\t"
02865             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
02866             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
02867 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
02868             "movq _HBClearMask, %%mm4     \n\t"
02869 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
02870 
02871             // prime the pump:  load the first Raw(x-bpp) data set
02872             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
02873                                                 // (correct pos. in loop below)
02874          "avg_3lp:                        \n\t"
02875             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
02876             "movq %%mm5, %%mm3            \n\t"
02877             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
02878                                                 // data
02879             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
02880             "movq %%mm7, %%mm6            \n\t"
02881             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
02882             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
02883             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
02884                                                 // byte
02885             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
02886                                                 // each byte
02887             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
02888             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02889                                                 // LBCarrys
02890             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02891                                                 // where both
02892                                // lsb's were == 1 (only valid for active group)
02893             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02894             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02895                                                 // byte
02896             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02897                                                 // for each byte
02898             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
02899                                                 // bytes to add to Avg
02900             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02901                                                 // Avg for each Active
02902                                //  byte
02903             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
02904             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
02905                                                 // bytes 3-5
02906             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
02907             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
02908             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02909                                                 // LBCarrys
02910             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02911                                                 // where both
02912                                // lsb's were == 1 (only valid for active group)
02913             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02914             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02915                                                 // byte
02916             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02917                                                 // for each byte
02918             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
02919                                                 // bytes to add to Avg
02920             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02921                                                 // Avg for each Active
02922                                //  byte
02923 
02924             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
02925             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
02926                                                 // two
02927                                  // bytes
02928             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
02929             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
02930                               // Data only needs to be shifted once here to
02931                               // get the correct x-bpp offset.
02932             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
02933                                                 // LBCarrys
02934             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
02935                                                 // where both
02936                               // lsb's were == 1 (only valid for active group)
02937             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
02938             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
02939                                                 // byte
02940             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
02941                                                 // for each byte
02942             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
02943                                                 // bytes to add to Avg
02944             "addl $8, %%ecx               \n\t"
02945             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
02946                                                 // Avg for each Active
02947                                                 // byte
02948             // now ready to write back to memory
02949             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
02950             // move updated Raw(x) to use as Raw(x-bpp) for next loop
02951             "cmpl _MMXLength, %%ecx       \n\t"
02952             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
02953             "jb avg_3lp                   \n\t"
02954 
02955             : "=S" (dummy_value_S),             // output regs (dummy)
02956               "=D" (dummy_value_D)
02957 
02958             : "0" (prev_row),  // esi           // input regs
02959               "1" (row)        // edi
02960 
02961             : "%ecx"                            // clobber list
02962 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
02963             , "%mm0", "%mm1", "%mm2", "%mm3"
02964             , "%mm4", "%mm5", "%mm6", "%mm7"
02965 #endif
02966          );
02967       }
02968       break;  // end 3 bpp
02969 
02970       case 6:
02971       case 4:
02972       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
02973       //case 5:   // GRR BOGUS
02974       {
02975          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
02976                                                   // appropriate inactive bytes
02977          _ShiftBpp.use = bpp << 3;
02978          _ShiftRem.use = 64 - _ShiftBpp.use;
02979 
02980          __asm__ __volatile__ (
02981             "movq _HBClearMask, %%mm4    \n\t"
02982 
02983             // re-init address pointers and offset
02984             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
02985                                                // alignment boundary
02986 
02987             // load _ActiveMask and clear all bytes except for 1st active group
02988             "movq _ActiveMask, %%mm7     \n\t"
02989 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
02990             "psrlq _ShiftRem, %%mm7      \n\t"
02991 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
02992             "movq %%mm7, %%mm6           \n\t"
02993             "movq _LBCarryMask, %%mm5    \n\t"
02994             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
02995                                                // group
02996 
02997             // prime the pump:  load the first Raw(x-bpp) data set
02998             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
02999                                           // (we correct pos. in loop below)
03000          "avg_4lp:                       \n\t"
03001             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03002             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
03003             "movq (%%esi,%%ecx,), %%mm1  \n\t"
03004             // add (Prev_row/2) to average
03005             "movq %%mm5, %%mm3           \n\t"
03006             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03007             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03008             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03009                                                // byte
03010             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03011                                                // each byte
03012             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
03013             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03014                                                // LBCarrys
03015             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03016                                                // where both
03017                               // lsb's were == 1 (only valid for active group)
03018             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03019             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03020                                                // byte
03021             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03022                                                // for each byte
03023             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
03024                                                // bytes to add to Avg
03025             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
03026                                                // for each Active
03027                               // byte
03028             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
03029             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03030             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03031             "addl $8, %%ecx              \n\t"
03032             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03033                                                // LBCarrys
03034             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03035                                                // where both
03036                               // lsb's were == 1 (only valid for active group)
03037             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03038             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03039                                                // byte
03040             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03041                                                // for each byte
03042             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03043                                                // bytes to add to Avg
03044             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03045                                                // Avg for each Active
03046                               // byte
03047             "cmpl _MMXLength, %%ecx      \n\t"
03048             // now ready to write back to memory
03049             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03050             // prep Raw(x-bpp) for next loop
03051             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03052             "jb avg_4lp                  \n\t"
03053 
03054             : "=S" (dummy_value_S),            // output regs (dummy)
03055               "=D" (dummy_value_D)
03056 
03057             : "0" (prev_row),  // esi          // input regs
03058               "1" (row)        // edi
03059 
03060             : "%ecx"                           // clobber list
03061 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03062             , "%mm0", "%mm1", "%mm2", "%mm3"
03063             , "%mm4", "%mm5", "%mm6", "%mm7"
03064 #endif
03065          );
03066       }
03067       break;  // end 4,6 bpp
03068 
03069       case 2:
03070       {
03071          _ActiveMask.use  = 0x000000000000ffffLL;
03072          _ShiftBpp.use = 16;   // == 2 * 8
03073          _ShiftRem.use = 48;   // == 64 - 16
03074 
03075          __asm__ __volatile__ (
03076             // load _ActiveMask
03077             "movq _ActiveMask, %%mm7     \n\t"
03078             // re-init address pointers and offset
03079             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
03080                                                // boundary
03081             "movq _LBCarryMask, %%mm5    \n\t"
03082 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03083             "movq _HBClearMask, %%mm4    \n\t"
03084 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03085 
03086             // prime the pump:  load the first Raw(x-bpp) data set
03087             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
03088                               // (we correct pos. in loop below)
03089          "avg_2lp:                       \n\t"
03090             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03091             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
03092             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
03093             // add (Prev_row/2) to average
03094             "movq %%mm5, %%mm3           \n\t"
03095             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03096             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03097             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03098                                                // byte
03099             "movq %%mm7, %%mm6           \n\t"
03100             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03101                                                // each byte
03102 
03103             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
03104             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03105                                                // LBCarrys
03106             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03107                                                // where both
03108                                                // lsb's were == 1 (only valid
03109                                                // for active group)
03110             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03111             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03112                                                // byte
03113             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03114                                                // for each byte
03115             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
03116                                                // bytes to add to Avg
03117             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
03118                                                // for each Active byte
03119 
03120             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
03121             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03122                                                // bytes 2 & 3
03123             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03124             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03125             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03126                                                // LBCarrys
03127             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03128                                                // where both
03129                                                // lsb's were == 1 (only valid
03130                                                // for active group)
03131             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03132             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03133                                                // byte
03134             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03135                                                // for each byte
03136             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03137                                                // bytes to add to Avg
03138             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03139                                                // Avg for each Active byte
03140 
03141             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
03142             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03143                                                // bytes 4 & 5
03144             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03145             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03146             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03147                                                // LBCarrys
03148             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03149                                                // where both lsb's were == 1
03150                                                // (only valid for active group)
03151             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03152             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03153                                                // byte
03154             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03155                                                // for each byte
03156             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03157                                                // bytes to add to Avg
03158             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03159                                                // Avg for each Active byte
03160 
03161             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
03162             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
03163                                                // bytes 6 & 7
03164             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03165             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
03166             "addl $8, %%ecx              \n\t"
03167             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
03168                                                // LBCarrys
03169             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
03170                                                // where both
03171                                                // lsb's were == 1 (only valid
03172                                                // for active group)
03173             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03174             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03175                                                // byte
03176             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
03177                                                // for each byte
03178             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
03179                                                // bytes to add to Avg
03180             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
03181                                                // Avg for each Active byte
03182 
03183             "cmpl _MMXLength, %%ecx      \n\t"
03184             // now ready to write back to memory
03185             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03186             // prep Raw(x-bpp) for next loop
03187             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
03188             "jb avg_2lp                  \n\t"
03189 
03190             : "=S" (dummy_value_S),            // output regs (dummy)
03191               "=D" (dummy_value_D)
03192 
03193             : "0" (prev_row),  // esi          // input regs
03194               "1" (row)        // edi
03195 
03196             : "%ecx"                           // clobber list
03197 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03198             , "%mm0", "%mm1", "%mm2", "%mm3"
03199             , "%mm4", "%mm5", "%mm6", "%mm7"
03200 #endif
03201          );
03202       }
03203       break;  // end 2 bpp
03204 
03205       case 1:
03206       {
03207          __asm__ __volatile__ (
03208             // re-init address pointers and offset
03209 #ifdef __PIC__
03210             "pushl %%ebx                 \n\t" // save Global Offset Table index
03211 #endif
03212             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
03213                                                // boundary
03214 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03215             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
03216             "jnb avg_1end                \n\t"
02217             // do Avg decode for remaining bytes
03218 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03219             "movl %%edi, %%edx           \n\t"
03220 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
03221             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
03222             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
03223                                                //  in loop below
03224          "avg_1lp:                       \n\t"
03225             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
03226             "xorl %%eax, %%eax           \n\t"
03227             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
03228             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
03229             "addw %%cx, %%ax             \n\t"
03230             "incl %%ebx                  \n\t"
03231             "shrw %%ax                   \n\t" // divide by 2
03232             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
03233                                                // inc ebx
03234             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
03235             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
03236                          // mov does not affect flags; -1 to offset inc ebx
03237             "jb avg_1lp                  \n\t"
03238 
03239          "avg_1end:                      \n\t"
03240 #ifdef __PIC__
03241             "popl %%ebx                  \n\t" // Global Offset Table index
03242 #endif
03243 
03244             : "=c" (dummy_value_c),            // output regs (dummy)
03245               "=S" (dummy_value_S),
03246               "=D" (dummy_value_D)
03247 
03248             : "0" (bpp),       // ecx          // input regs
03249               "1" (prev_row),  // esi
03250               "2" (row)        // edi
03251 
03252             : "%eax", "%edx"                   // clobber list
03253 #ifndef __PIC__
03254             , "%ebx"
03255 #endif
03256          );
03257       }
03258       return;  // end 1 bpp
03259 
03260       case 8:
03261       {
03262          __asm__ __volatile__ (
03263             // re-init address pointers and offset
03264             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
03265             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
03266 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
03267             "movq _HBClearMask, %%mm4    \n\t"
03268 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03269 
03270             // prime the pump:  load the first Raw(x-bpp) data set
03271             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
03272                                       // (NO NEED to correct pos. in loop below)
03273 
03274          "avg_8lp:                       \n\t"
03275             "movq (%%edi,%%ecx,), %%mm0  \n\t"
03276             "movq %%mm5, %%mm3           \n\t"
03277             "movq (%%esi,%%ecx,), %%mm1  \n\t"
03278             "addl $8, %%ecx              \n\t"
03279             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03280             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03281             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
03282                                                //  where both lsb's were == 1
03283             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03284             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
03285             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
03286             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
03287             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
03288             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
03289             "cmpl _MMXLength, %%ecx      \n\t"
03290             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
03291             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
03292             "jb avg_8lp                  \n\t"
03293 
03294             : "=S" (dummy_value_S),            // output regs (dummy)
03295               "=D" (dummy_value_D)
03296 
03297             : "0" (prev_row),  // esi          // input regs
03298               "1" (row)        // edi
03299 
03300             : "%ecx"                           // clobber list
03301 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
03302             , "%mm0", "%mm1", "%mm2"
03303             , "%mm3", "%mm4", "%mm5"
03304 #endif
03305          );
03306       }
03307       break;  // end 8 bpp
03308 
03309       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
03310       {
03311 
03312 #ifdef PNG_DEBUG
03313          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
03314         png_debug(1,
03315         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
03316 #endif
03317 
03318 #if 0
03319         __asm__ __volatile__ (
03320             "movq _LBCarryMask, %%mm5    \n\t"
03321             // re-init address pointers and offset
03322             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
03323                                                // alignment boundary
03324             "movl row, %%edi             \n\t" // edi:  Avg(x)
03325             "movq _HBClearMask, %%mm4    \n\t"
03326             "movl %%edi, %%edx           \n\t"
03327             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03328             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
03329          "avg_Alp:                       \n\t"
03330             "movq (%%edi,%%ebx,), %%mm0  \n\t"
03331             "movq %%mm5, %%mm3           \n\t"
03332             "movq (%%esi,%%ebx,), %%mm1  \n\t"
03333             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
03334             "movq (%%edx,%%ebx,), %%mm2  \n\t"
03335             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
03336             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
03337                                                // where both lsb's were == 1
03338             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
03339             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
03340                                                // byte
03341             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
03342                                                // byte
03343             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
03344                                                // byte
03345             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
03346                                                // each byte
03347             "addl $8, %%ebx              \n\t"
03348             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
03349                                                // byte
03350             "cmpl _MMXLength, %%ebx      \n\t"
03351             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
03352             "jb avg_Alp                  \n\t"
03353 
03354             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
03355 
03356             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
03357 
03358             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
03359          );
03360 #endif /* 0 - NEVER REACHED */
03361       }
03362       break;
03363 
03364    } // end switch (bpp)
03365 
03366    __asm__ __volatile__ (
03367       // MMX acceleration complete; now do clean-up
03368       // check if any remaining bytes left to decode
03369 #ifdef __PIC__
03370       "pushl %%ebx                 \n\t" // save index to Global Offset Table
03371 #endif
03372       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
03373 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
03374       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
03375       "jnb avg_end                 \n\t"
03376 
03377       // do Avg decode for remaining bytes
03378 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
03379       "movl %%edi, %%edx           \n\t"
03380 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
03381       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
03382       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
03383 
03384    "avg_lp2:                       \n\t"
03385       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
03386       "xorl %%eax, %%eax           \n\t"
03387       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
03388       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
03389       "addw %%cx, %%ax             \n\t"
03390       "incl %%ebx                  \n\t"
03391       "shrw %%ax                   \n\t" // divide by 2
03392       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
03393       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
03394       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
03395       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
03396 
03397    "avg_end:                       \n\t"
03398       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
03399 #ifdef __PIC__
03400       "popl %%ebx                  \n\t" // restore index to Global Offset Table
03401 #endif
03402 
03403       : "=c" (dummy_value_c),            // output regs (dummy)
03404         "=S" (dummy_value_S),
03405         "=D" (dummy_value_D)
03406 
03407       : "0" (bpp),       // ecx          // input regs
03408         "1" (prev_row),  // esi
03409         "2" (row)        // edi
03410 
03411       : "%eax", "%edx"                   // clobber list
03412 #ifndef __PIC__
03413       , "%ebx"
03414 #endif
03415    );
03416 
03417 } /* end png_read_filter_row_mmx_avg() */
03418 #endif
03419 
03420 
03421 
03422 #ifdef PNG_THREAD_UNSAFE_OK
03423 //===========================================================================//
03424 //                                                                           //
03425 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
03426 //                                                                           //
03427 //===========================================================================//
03428 
03429 // Optimized code for PNG Paeth filter decoder
03430 
03431 static void /* PRIVATE */
03432 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
03433                               png_bytep prev_row)
03434 {
03435    int bpp;
03436    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
03437    int dummy_value_S;
03438    int dummy_value_D;
03439 
03440    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
03441    _FullLength  = row_info->rowbytes; // # of bytes to filter
03442 
03443    __asm__ __volatile__ (
03444 #ifdef __PIC__
03445       "pushl %%ebx                 \n\t" // save index to Global Offset Table
03446 #endif
03447       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
03448 //pre "movl row, %%edi             \n\t"
03449       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
03450 //pre "movl prev_row, %%esi        \n\t"
03451       "xorl %%eax, %%eax           \n\t"
03452 
03453       // Compute the Raw value for the first bpp bytes
03454       // Note: the formula works out to be always
03455       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
03456    "paeth_rlp:                     \n\t"
03457       "movb (%%edi,%%ebx,), %%al   \n\t"
03458       "addb (%%esi,%%ebx,), %%al   \n\t"
03459       "incl %%ebx                  \n\t"
03460 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
03461       "cmpl %%ecx, %%ebx           \n\t"
03462       "movb %%al, -1(%%edi,%%ebx,) \n\t"
03463       "jb paeth_rlp                \n\t"
03464       // get # of bytes to alignment
03465       "movl %%edi, _dif            \n\t" // take start of row
03466       "addl %%ebx, _dif            \n\t" // add bpp
03467       "xorl %%ecx, %%ecx           \n\t"
03468       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
03469                                          // boundary
03470       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
03471       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
03472                                          // at alignment
03473       "jz paeth_go                 \n\t"
03474       // fix alignment
03475 
03476    "paeth_lp1:                     \n\t"
03477       "xorl %%eax, %%eax           \n\t"
03478       // pav = p - a = (a + b - c) - a = b - c
03479       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
03480       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03481       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
03482       "movl %%eax, _patemp         \n\t" // Save pav for later use
03483       "xorl %%eax, %%eax           \n\t"
03484       // pbv = p - b = (a + b - c) - b = a - c
03485       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
03486       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
03487       "movl %%eax, %%ecx           \n\t"
03488       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03489       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
03490       // pc = abs(pcv)
03491       "testl $0x80000000, %%eax    \n\t"
03492       "jz paeth_pca                \n\t"
03493       "negl %%eax                  \n\t" // reverse sign of neg values
03494 
03495    "paeth_pca:                     \n\t"
03496       "movl %%eax, _pctemp         \n\t" // save pc for later use
03497       // pb = abs(pbv)
03498       "testl $0x80000000, %%ecx    \n\t"
03499       "jz paeth_pba                \n\t"
03500       "negl %%ecx                  \n\t" // reverse sign of neg values
03501 
03502    "paeth_pba:                     \n\t"
03503       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
03504       // pa = abs(pav)
03505       "movl _patemp, %%eax         \n\t"
03506       "testl $0x80000000, %%eax    \n\t"
03507       "jz paeth_paa                \n\t"
03508       "negl %%eax                  \n\t" // reverse sign of neg values
03509 
03510    "paeth_paa:                     \n\t"
03511       "movl %%eax, _patemp         \n\t" // save pa for later use
03512       // test if pa <= pb
03513       "cmpl %%ecx, %%eax           \n\t"
03514       "jna paeth_abb               \n\t"
03515       // pa > pb; now test if pb <= pc
03516       "cmpl _pctemp, %%ecx         \n\t"
03517       "jna paeth_bbc               \n\t"
03518       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03519       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03520       "jmp paeth_paeth             \n\t"
03521 
03522    "paeth_bbc:                     \n\t"
03523       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
03524       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
03525       "jmp paeth_paeth             \n\t"
03526 
03527    "paeth_abb:                     \n\t"
03528       // pa <= pb; now test if pa <= pc
03529       "cmpl _pctemp, %%eax         \n\t"
03530       "jna paeth_abc               \n\t"
03531       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03532       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
03533       "jmp paeth_paeth             \n\t"
03534 
03535    "paeth_abc:                     \n\t"
03536       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
03537       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
03538 
03539    "paeth_paeth:                   \n\t"
03540       "incl %%ebx                  \n\t"
03541       "incl %%edx                  \n\t"
03542       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
03543       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
03544       "cmpl _dif, %%ebx            \n\t"
03545       "jb paeth_lp1                \n\t"
03546 
03547    "paeth_go:                      \n\t"
03548       "movl _FullLength, %%ecx     \n\t"
03549       "movl %%ecx, %%eax           \n\t"
03550       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
03551       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
03552       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
03553       "movl %%ecx, _MMXLength      \n\t"
03554 #ifdef __PIC__
03555       "popl %%ebx                  \n\t" // restore index to Global Offset Table
03556 #endif
03557 
03558       : "=c" (dummy_value_c),            // output regs (dummy)
03559         "=S" (dummy_value_S),
03560         "=D" (dummy_value_D)
03561 
03562       : "0" (bpp),       // ecx          // input regs
03563         "1" (prev_row),  // esi
03564         "2" (row)        // edi
03565 
03566       : "%eax", "%edx"                   // clobber list
03567 #ifndef __PIC__
03568       , "%ebx"
03569 #endif
03570    );
03571 
03572    // now do the math for the rest of the row
03573    switch (bpp)
03574    {
03575       case 3:
03576       {
03577          _ActiveMask.use = 0x0000000000ffffffLL;
03578          _ActiveMaskEnd.use = 0xffff000000000000LL;
03579          _ShiftBpp.use = 24;    // == bpp(3) * 8
03580          _ShiftRem.use = 40;    // == 64 - 24
03581 
03582          __asm__ __volatile__ (
03583             "movl _dif, %%ecx            \n\t"
03584 // preload  "movl row, %%edi             \n\t"
03585 // preload  "movl prev_row, %%esi        \n\t"
03586             "pxor %%mm0, %%mm0           \n\t"
03587             // prime the pump:  load the first Raw(x-bpp) data set
03588             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03589          "paeth_3lp:                     \n\t"
03590             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
03591                                                // 3 bytes
03592             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03593             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03594             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
03595             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03596             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
03597                                                // 3 bytes
03598             // pav = p - a = (a + b - c) - a = b - c
03599             "movq %%mm2, %%mm4           \n\t"
03600             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03601             // pbv = p - b = (a + b - c) - b = a - c
03602             "movq %%mm1, %%mm5           \n\t"
03603             "psubw %%mm3, %%mm4          \n\t"
03604             "pxor %%mm7, %%mm7           \n\t"
03605             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03606             "movq %%mm4, %%mm6           \n\t"
03607             "psubw %%mm3, %%mm5          \n\t"
03608 
03609             // pa = abs(p-a) = abs(pav)
03610             // pb = abs(p-b) = abs(pbv)
03611             // pc = abs(p-c) = abs(pcv)
03612             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03613             "paddw %%mm5, %%mm6          \n\t"
03614             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
03615             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03616             "psubw %%mm0, %%mm4          \n\t"
03617             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
03618             "psubw %%mm0, %%mm4          \n\t"
03619             "psubw %%mm7, %%mm5          \n\t"
03620             "pxor %%mm0, %%mm0           \n\t"
03621             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03622             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
03623             "psubw %%mm7, %%mm5          \n\t"
03624             "psubw %%mm0, %%mm6          \n\t"
03625             //  test pa <= pb
03626             "movq %%mm4, %%mm7           \n\t"
03627             "psubw %%mm0, %%mm6          \n\t"
03628             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03629             "movq %%mm7, %%mm0           \n\t"
03630             // use mm7 mask to merge pa & pb
03631             "pand %%mm7, %%mm5           \n\t"
03632             // use mm0 mask copy to merge a & b
03633             "pand %%mm0, %%mm2           \n\t"
03634             "pandn %%mm4, %%mm7          \n\t"
03635             "pandn %%mm1, %%mm0          \n\t"
03636             "paddw %%mm5, %%mm7          \n\t"
03637             "paddw %%mm2, %%mm0          \n\t"
03638             //  test  ((pa <= pb)? pa:pb) <= pc
03639             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03640             "pxor %%mm1, %%mm1           \n\t"
03641             "pand %%mm7, %%mm3           \n\t"
03642             "pandn %%mm0, %%mm7          \n\t"
03643             "paddw %%mm3, %%mm7          \n\t"
03644             "pxor %%mm0, %%mm0           \n\t"
03645             "packuswb %%mm1, %%mm7       \n\t"
03646             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
03647             "pand _ActiveMask, %%mm7     \n\t"
03648             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
03649             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
03650             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03651             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03652             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
03653                                                // Raw(x-bpp)
03654             // now do Paeth for 2nd set of bytes (3-5)
03655             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
03656             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03657             "pxor %%mm7, %%mm7           \n\t"
03658             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03659             // pbv = p - b = (a + b - c) - b = a - c
03660             "movq %%mm1, %%mm5           \n\t"
03661             // pav = p - a = (a + b - c) - a = b - c
03662             "movq %%mm2, %%mm4           \n\t"
03663             "psubw %%mm3, %%mm5          \n\t"
03664             "psubw %%mm3, %%mm4          \n\t"
03665             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
03666             //       pav + pbv = pbv + pav
03667             "movq %%mm5, %%mm6           \n\t"
03668             "paddw %%mm4, %%mm6          \n\t"
03669 
03670             // pa = abs(p-a) = abs(pav)
03671             // pb = abs(p-b) = abs(pbv)
03672             // pc = abs(p-c) = abs(pcv)
03673             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
03674             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
03675             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
03676             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
03677             "psubw %%mm0, %%mm5          \n\t"
03678             "psubw %%mm7, %%mm4          \n\t"
03679             "psubw %%mm0, %%mm5          \n\t"
03680             "psubw %%mm7, %%mm4          \n\t"
03681             "pxor %%mm0, %%mm0           \n\t"
03682             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03683             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
03684             "psubw %%mm0, %%mm6          \n\t"
03685             //  test pa <= pb
03686             "movq %%mm4, %%mm7           \n\t"
03687             "psubw %%mm0, %%mm6          \n\t"
03688             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03689             "movq %%mm7, %%mm0           \n\t"
03690             // use mm7 mask to merge pa & pb
03691             "pand %%mm7, %%mm5           \n\t"
03692             // use mm0 mask copy to merge a & b
03693             "pand %%mm0, %%mm2           \n\t"
03694             "pandn %%mm4, %%mm7          \n\t"
03695             "pandn %%mm1, %%mm0          \n\t"
03696             "paddw %%mm5, %%mm7          \n\t"
03697             "paddw %%mm2, %%mm0          \n\t"
03698             //  test  ((pa <= pb)? pa:pb) <= pc
03699             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03700             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03701             "pand %%mm7, %%mm3           \n\t"
03702             "pandn %%mm0, %%mm7          \n\t"
03703             "pxor %%mm1, %%mm1           \n\t"
03704             "paddw %%mm3, %%mm7          \n\t"
03705             "pxor %%mm0, %%mm0           \n\t"
03706             "packuswb %%mm1, %%mm7       \n\t"
03707             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
03708             "pand _ActiveMask, %%mm7     \n\t"
03709             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03710             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
03711                                                // 3 bytes
03712              // pav = p - a = (a + b - c) - a = b - c
03713             "movq %%mm2, %%mm4           \n\t"
03714             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
03715             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
03716             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03717             "movq %%mm7, %%mm1           \n\t"
03718             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03719             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
03720                                     // now mm1 will be used as Raw(x-bpp)
03721             // now do Paeth for 3rd, and final, set of bytes (6-7)
03722             "pxor %%mm7, %%mm7           \n\t"
03723             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03724             "psubw %%mm3, %%mm4          \n\t"
03725             // pbv = p - b = (a + b - c) - b = a - c
03726             "movq %%mm1, %%mm5           \n\t"
03727             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03728             "movq %%mm4, %%mm6           \n\t"
03729             "psubw %%mm3, %%mm5          \n\t"
03730             "pxor %%mm0, %%mm0           \n\t"
03731             "paddw %%mm5, %%mm6          \n\t"
03732 
03733             // pa = abs(p-a) = abs(pav)
03734             // pb = abs(p-b) = abs(pbv)
03735             // pc = abs(p-c) = abs(pcv)
03736             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03737             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03738             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
03739             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
03740             "psubw %%mm0, %%mm4          \n\t"
03741             "psubw %%mm7, %%mm5          \n\t"
03742             "psubw %%mm0, %%mm4          \n\t"
03743             "psubw %%mm7, %%mm5          \n\t"
03744             "pxor %%mm0, %%mm0           \n\t"
03745             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03746             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
03747             "psubw %%mm0, %%mm6          \n\t"
03748             //  test pa <= pb
03749             "movq %%mm4, %%mm7           \n\t"
03750             "psubw %%mm0, %%mm6          \n\t"
03751             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03752             "movq %%mm7, %%mm0           \n\t"
03753             // use mm0 mask copy to merge a & b
03754             "pand %%mm0, %%mm2           \n\t"
03755             // use mm7 mask to merge pa & pb
03756             "pand %%mm7, %%mm5           \n\t"
03757             "pandn %%mm1, %%mm0          \n\t"
03758             "pandn %%mm4, %%mm7          \n\t"
03759             "paddw %%mm2, %%mm0          \n\t"
03760             "paddw %%mm5, %%mm7          \n\t"
03761             //  test  ((pa <= pb)? pa:pb) <= pc
03762             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03763             "pand %%mm7, %%mm3           \n\t"
03764             "pandn %%mm0, %%mm7          \n\t"
03765             "paddw %%mm3, %%mm7          \n\t"
03766             "pxor %%mm1, %%mm1           \n\t"
03767             "packuswb %%mm7, %%mm1       \n\t"
03768             // step ecx to next set of 8 bytes and repeat loop til done
03769             "addl $8, %%ecx              \n\t"
03770             "pand _ActiveMaskEnd, %%mm1  \n\t"
03771             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
03772                                                  // Raw(x)
03773 
03774             "cmpl _MMXLength, %%ecx      \n\t"
03775             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
03776             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
03777                                  // mm1 will be used as Raw(x-bpp) next loop
03778                            // mm3 ready to be used as Prior(x-bpp) next loop
03779             "jb paeth_3lp                \n\t"
03780 
03781             : "=S" (dummy_value_S),             // output regs (dummy)
03782               "=D" (dummy_value_D)
03783 
03784             : "0" (prev_row),  // esi           // input regs
03785               "1" (row)        // edi
03786 
03787             : "%ecx"                            // clobber list
03788 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03789             , "%mm0", "%mm1", "%mm2", "%mm3"
03790             , "%mm4", "%mm5", "%mm6", "%mm7"
03791 #endif
03792          );
03793       }
03794       break;  // end 3 bpp
03795 
03796       case 6:
03797       //case 7:   // GRR BOGUS
03798       //case 5:   // GRR BOGUS
03799       {
03800          _ActiveMask.use  = 0x00000000ffffffffLL;
03801          _ActiveMask2.use = 0xffffffff00000000LL;
03802          _ShiftBpp.use = bpp << 3;    // == bpp * 8
03803          _ShiftRem.use = 64 - _ShiftBpp.use;
03804 
03805          __asm__ __volatile__ (
03806             "movl _dif, %%ecx            \n\t"
03807 // preload  "movl row, %%edi             \n\t"
03808 // preload  "movl prev_row, %%esi        \n\t"
03809             // prime the pump:  load the first Raw(x-bpp) data set
03810             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03811             "pxor %%mm0, %%mm0           \n\t"
03812 
03813          "paeth_6lp:                     \n\t"
03814             // must shift to position Raw(x-bpp) data
03815             "psrlq _ShiftRem, %%mm1      \n\t"
03816             // do first set of 4 bytes
03817             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
03818             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
03819             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03820             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03821             // must shift to position Prior(x-bpp) data
03822             "psrlq _ShiftRem, %%mm3      \n\t"
03823             // pav = p - a = (a + b - c) - a = b - c
03824             "movq %%mm2, %%mm4           \n\t"
03825             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
03826             // pbv = p - b = (a + b - c) - b = a - c
03827             "movq %%mm1, %%mm5           \n\t"
03828             "psubw %%mm3, %%mm4          \n\t"
03829             "pxor %%mm7, %%mm7           \n\t"
03830             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03831             "movq %%mm4, %%mm6           \n\t"
03832             "psubw %%mm3, %%mm5          \n\t"
03833             // pa = abs(p-a) = abs(pav)
03834             // pb = abs(p-b) = abs(pbv)
03835             // pc = abs(p-c) = abs(pcv)
03836             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03837             "paddw %%mm5, %%mm6          \n\t"
03838             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
03839             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03840             "psubw %%mm0, %%mm4          \n\t"
03841             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
03842             "psubw %%mm0, %%mm4          \n\t"
03843             "psubw %%mm7, %%mm5          \n\t"
03844             "pxor %%mm0, %%mm0           \n\t"
03845             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03846             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
03847             "psubw %%mm7, %%mm5          \n\t"
03848             "psubw %%mm0, %%mm6          \n\t"
03849             //  test pa <= pb
03850             "movq %%mm4, %%mm7           \n\t"
03851             "psubw %%mm0, %%mm6          \n\t"
03852             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03853             "movq %%mm7, %%mm0           \n\t"
03854             // use mm7 mask to merge pa & pb
03855             "pand %%mm7, %%mm5           \n\t"
03856             // use mm0 mask copy to merge a & b
03857             "pand %%mm0, %%mm2           \n\t"
03858             "pandn %%mm4, %%mm7          \n\t"
03859             "pandn %%mm1, %%mm0          \n\t"
03860             "paddw %%mm5, %%mm7          \n\t"
03861             "paddw %%mm2, %%mm0          \n\t"
03862             //  test  ((pa <= pb)? pa:pb) <= pc
03863             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03864             "pxor %%mm1, %%mm1           \n\t"
03865             "pand %%mm7, %%mm3           \n\t"
03866             "pandn %%mm0, %%mm7          \n\t"
03867             "paddw %%mm3, %%mm7          \n\t"
03868             "pxor %%mm0, %%mm0           \n\t"
03869             "packuswb %%mm1, %%mm7       \n\t"
03870             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
03871             "pand _ActiveMask, %%mm7     \n\t"
03872             "psrlq _ShiftRem, %%mm3      \n\t"
03873             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
03874             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
03875             "movq %%mm2, %%mm6           \n\t"
03876             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
03877             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
03878             "psllq _ShiftBpp, %%mm6      \n\t"
03879             "movq %%mm7, %%mm5           \n\t"
03880             "psrlq _ShiftRem, %%mm1      \n\t"
03881             "por %%mm6, %%mm3            \n\t"
03882             "psllq _ShiftBpp, %%mm5      \n\t"
03883             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03884             "por %%mm5, %%mm1            \n\t"
03885             // do second set of 4 bytes
03886             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
03887             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03888             // pav = p - a = (a + b - c) - a = b - c
03889             "movq %%mm2, %%mm4           \n\t"
03890             // pbv = p - b = (a + b - c) - b = a - c
03891             "movq %%mm1, %%mm5           \n\t"
03892             "psubw %%mm3, %%mm4          \n\t"
03893             "pxor %%mm7, %%mm7           \n\t"
03894             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03895             "movq %%mm4, %%mm6           \n\t"
03896             "psubw %%mm3, %%mm5          \n\t"
03897             // pa = abs(p-a) = abs(pav)
03898             // pb = abs(p-b) = abs(pbv)
03899             // pc = abs(p-c) = abs(pcv)
03900             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03901             "paddw %%mm5, %%mm6          \n\t"
03902             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03903             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03904             "psubw %%mm0, %%mm4          \n\t"
03905             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03906             "psubw %%mm0, %%mm4          \n\t"
03907             "psubw %%mm7, %%mm5          \n\t"
03908             "pxor %%mm0, %%mm0           \n\t"
03909             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03910             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
03911             "psubw %%mm7, %%mm5          \n\t"
03912             "psubw %%mm0, %%mm6          \n\t"
03913             //  test pa <= pb
03914             "movq %%mm4, %%mm7           \n\t"
03915             "psubw %%mm0, %%mm6          \n\t"
03916             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
03917             "movq %%mm7, %%mm0           \n\t"
03918             // use mm7 mask to merge pa & pb
03919             "pand %%mm7, %%mm5           \n\t"
03920             // use mm0 mask copy to merge a & b
03921             "pand %%mm0, %%mm2           \n\t"
03922             "pandn %%mm4, %%mm7          \n\t"
03923             "pandn %%mm1, %%mm0          \n\t"
03924             "paddw %%mm5, %%mm7          \n\t"
03925             "paddw %%mm2, %%mm0          \n\t"
03926             //  test  ((pa <= pb)? pa:pb) <= pc
03927             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
03928             "pxor %%mm1, %%mm1           \n\t"
03929             "pand %%mm7, %%mm3           \n\t"
03930             "pandn %%mm0, %%mm7          \n\t"
03931             "pxor %%mm1, %%mm1           \n\t"
03932             "paddw %%mm3, %%mm7          \n\t"
03933             "pxor %%mm0, %%mm0           \n\t"
03934             // step ecx to next set of 8 bytes and repeat loop til done
03935             "addl $8, %%ecx              \n\t"
03936             "packuswb %%mm7, %%mm1       \n\t"
03937             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
03938             "cmpl _MMXLength, %%ecx      \n\t"
03939             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
03940                                 // mm1 will be used as Raw(x-bpp) next loop
03941             "jb paeth_6lp                \n\t"
03942 
03943             : "=S" (dummy_value_S),             // output regs (dummy)
03944               "=D" (dummy_value_D)
03945 
03946             : "0" (prev_row),  // esi           // input regs
03947               "1" (row)        // edi
03948 
03949             : "%ecx"                            // clobber list
03950 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
03951             , "%mm0", "%mm1", "%mm2", "%mm3"
03952             , "%mm4", "%mm5", "%mm6", "%mm7"
03953 #endif
03954          );
03955       }
03956       break;  // end 6 bpp
03957 
03958       case 4:
03959       {
03960          _ActiveMask.use  = 0x00000000ffffffffLL;
03961 
03962          __asm__ __volatile__ (
03963             "movl _dif, %%ecx            \n\t"
03964 // preload  "movl row, %%edi             \n\t"
03965 // preload  "movl prev_row, %%esi        \n\t"
03966             "pxor %%mm0, %%mm0           \n\t"
03967             // prime the pump:  load the first Raw(x-bpp) data set
03968             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
03969                                      //  a=Raw(x-bpp) bytes
03970          "paeth_4lp:                     \n\t"
03971             // do first set of 4 bytes
03972             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
03973             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
03974             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
03975             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
03976             // pav = p - a = (a + b - c) - a = b - c
03977             "movq %%mm2, %%mm4           \n\t"
03978             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
03979             // pbv = p - b = (a + b - c) - b = a - c
03980             "movq %%mm1, %%mm5           \n\t"
03981             "psubw %%mm3, %%mm4          \n\t"
03982             "pxor %%mm7, %%mm7           \n\t"
03983             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03984             "movq %%mm4, %%mm6           \n\t"
03985             "psubw %%mm3, %%mm5          \n\t"
03986             // pa = abs(p-a) = abs(pav)
03987             // pb = abs(p-b) = abs(pbv)
03988             // pc = abs(p-c) = abs(pcv)
03989             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
03990             "paddw %%mm5, %%mm6          \n\t"
03991             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
03992             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
03993             "psubw %%mm0, %%mm4          \n\t"
03994             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
03995             "psubw %%mm0, %%mm4          \n\t"
03996             "psubw %%mm7, %%mm5          \n\t"
03997             "pxor %%mm0, %%mm0           \n\t"
03998             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
03999             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04000             "psubw %%mm7, %%mm5          \n\t"
04001             "psubw %%mm0, %%mm6          \n\t"
04002             //  test pa <= pb
04003             "movq %%mm4, %%mm7           \n\t"
04004             "psubw %%mm0, %%mm6          \n\t"
04005             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04006             "movq %%mm7, %%mm0           \n\t"
04007             // use mm7 mask to merge pa & pb
04008             "pand %%mm7, %%mm5           \n\t"
04009             // use mm0 mask copy to merge a & b
04010             "pand %%mm0, %%mm2           \n\t"
04011             "pandn %%mm4, %%mm7          \n\t"
04012             "pandn %%mm1, %%mm0          \n\t"
04013             "paddw %%mm5, %%mm7          \n\t"
04014             "paddw %%mm2, %%mm0          \n\t"
04015             //  test  ((pa <= pb)? pa:pb) <= pc
04016             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04017             "pxor %%mm1, %%mm1           \n\t"
04018             "pand %%mm7, %%mm3           \n\t"
04019             "pandn %%mm0, %%mm7          \n\t"
04020             "paddw %%mm3, %%mm7          \n\t"
04021             "pxor %%mm0, %%mm0           \n\t"
04022             "packuswb %%mm1, %%mm7       \n\t"
04023             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
04024             "pand _ActiveMask, %%mm7     \n\t"
04025             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
04026             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
04027             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
04028             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
04029             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
04030             // do second set of 4 bytes
04031             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
04032             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
04033             // pav = p - a = (a + b - c) - a = b - c
04034             "movq %%mm2, %%mm4           \n\t"
04035             // pbv = p - b = (a + b - c) - b = a - c
04036             "movq %%mm1, %%mm5           \n\t"
04037             "psubw %%mm3, %%mm4          \n\t"
04038             "pxor %%mm7, %%mm7           \n\t"
04039             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04040             "movq %%mm4, %%mm6           \n\t"
04041             "psubw %%mm3, %%mm5          \n\t"
04042             // pa = abs(p-a) = abs(pav)
04043             // pb = abs(p-b) = abs(pbv)
04044             // pc = abs(p-c) = abs(pcv)
04045             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04046             "paddw %%mm5, %%mm6          \n\t"
04047             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04048             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04049             "psubw %%mm0, %%mm4          \n\t"
04050             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04051             "psubw %%mm0, %%mm4          \n\t"
04052             "psubw %%mm7, %%mm5          \n\t"
04053             "pxor %%mm0, %%mm0           \n\t"
04054             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04055             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04056             "psubw %%mm7, %%mm5          \n\t"
04057             "psubw %%mm0, %%mm6          \n\t"
04058             //  test pa <= pb
04059             "movq %%mm4, %%mm7           \n\t"
04060             "psubw %%mm0, %%mm6          \n\t"
04061             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04062             "movq %%mm7, %%mm0           \n\t"
04063             // use mm7 mask to merge pa & pb
04064             "pand %%mm7, %%mm5           \n\t"
04065             // use mm0 mask copy to merge a & b
04066             "pand %%mm0, %%mm2           \n\t"
04067             "pandn %%mm4, %%mm7          \n\t"
04068             "pandn %%mm1, %%mm0          \n\t"
04069             "paddw %%mm5, %%mm7          \n\t"
04070             "paddw %%mm2, %%mm0          \n\t"
04071             //  test  ((pa <= pb)? pa:pb) <= pc
04072             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04073             "pxor %%mm1, %%mm1           \n\t"
04074             "pand %%mm7, %%mm3           \n\t"
04075             "pandn %%mm0, %%mm7          \n\t"
04076             "pxor %%mm1, %%mm1           \n\t"
04077             "paddw %%mm3, %%mm7          \n\t"
04078             "pxor %%mm0, %%mm0           \n\t"
04079             // step ecx to next set of 8 bytes and repeat loop til done
04080             "addl $8, %%ecx              \n\t"
04081             "packuswb %%mm7, %%mm1       \n\t"
04082             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
04083             "cmpl _MMXLength, %%ecx      \n\t"
04084             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
04085                                 // mm1 will be used as Raw(x-bpp) next loop
04086             "jb paeth_4lp                \n\t"
04087 
04088             : "=S" (dummy_value_S),             // output regs (dummy)
04089               "=D" (dummy_value_D)
04090 
04091             : "0" (prev_row),  // esi           // input regs
04092               "1" (row)        // edi
04093 
04094             : "%ecx"                            // clobber list
04095 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
04096             , "%mm0", "%mm1", "%mm2", "%mm3"
04097             , "%mm4", "%mm5", "%mm6", "%mm7"
04098 #endif
04099          );
04100       }
04101       break;  // end 4 bpp
04102 
04103       case 8:                          // bpp == 8
04104       {
04105          _ActiveMask.use  = 0x00000000ffffffffLL;
04106 
04107          __asm__ __volatile__ (
04108             "movl _dif, %%ecx            \n\t"
04109 // preload  "movl row, %%edi             \n\t"
04110 // preload  "movl prev_row, %%esi        \n\t"
04111             "pxor %%mm0, %%mm0           \n\t"
04112             // prime the pump:  load the first Raw(x-bpp) data set
04113             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
04114                                        //  a=Raw(x-bpp) bytes
04115          "paeth_8lp:                     \n\t"
04116             // do first set of 4 bytes
04117             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
04118             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
04119             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
04120             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
04121             // pav = p - a = (a + b - c) - a = b - c
04122             "movq %%mm2, %%mm4           \n\t"
04123             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
04124             // pbv = p - b = (a + b - c) - b = a - c
04125             "movq %%mm1, %%mm5           \n\t"
04126             "psubw %%mm3, %%mm4          \n\t"
04127             "pxor %%mm7, %%mm7           \n\t"
04128             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04129             "movq %%mm4, %%mm6           \n\t"
04130             "psubw %%mm3, %%mm5          \n\t"
04131             // pa = abs(p-a) = abs(pav)
04132             // pb = abs(p-b) = abs(pbv)
04133             // pc = abs(p-c) = abs(pcv)
04134             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04135             "paddw %%mm5, %%mm6          \n\t"
04136             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04137             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04138             "psubw %%mm0, %%mm4          \n\t"
04139             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04140             "psubw %%mm0, %%mm4          \n\t"
04141             "psubw %%mm7, %%mm5          \n\t"
04142             "pxor %%mm0, %%mm0           \n\t"
04143             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04144             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04145             "psubw %%mm7, %%mm5          \n\t"
04146             "psubw %%mm0, %%mm6          \n\t"
04147             //  test pa <= pb
04148             "movq %%mm4, %%mm7           \n\t"
04149             "psubw %%mm0, %%mm6          \n\t"
04150             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04151             "movq %%mm7, %%mm0           \n\t"
04152             // use mm7 mask to merge pa & pb
04153             "pand %%mm7, %%mm5           \n\t"
04154             // use mm0 mask copy to merge a & b
04155             "pand %%mm0, %%mm2           \n\t"
04156             "pandn %%mm4, %%mm7          \n\t"
04157             "pandn %%mm1, %%mm0          \n\t"
04158             "paddw %%mm5, %%mm7          \n\t"
04159             "paddw %%mm2, %%mm0          \n\t"
04160             //  test  ((pa <= pb)? pa:pb) <= pc
04161             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04162             "pxor %%mm1, %%mm1           \n\t"
04163             "pand %%mm7, %%mm3           \n\t"
04164             "pandn %%mm0, %%mm7          \n\t"
04165             "paddw %%mm3, %%mm7          \n\t"
04166             "pxor %%mm0, %%mm0           \n\t"
04167             "packuswb %%mm1, %%mm7       \n\t"
04168             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
04169             "pand _ActiveMask, %%mm7     \n\t"
04170             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
04171             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
04172             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
04173             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
04174             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
04175 
04176             // do second set of 4 bytes
04177             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
04178             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
04179             // pav = p - a = (a + b - c) - a = b - c
04180             "movq %%mm2, %%mm4           \n\t"
04181             // pbv = p - b = (a + b - c) - b = a - c
04182             "movq %%mm1, %%mm5           \n\t"
04183             "psubw %%mm3, %%mm4          \n\t"
04184             "pxor %%mm7, %%mm7           \n\t"
04185             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04186             "movq %%mm4, %%mm6           \n\t"
04187             "psubw %%mm3, %%mm5          \n\t"
04188             // pa = abs(p-a) = abs(pav)
04189             // pb = abs(p-b) = abs(pbv)
04190             // pc = abs(p-c) = abs(pcv)
04191             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
04192             "paddw %%mm5, %%mm6          \n\t"
04193             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
04194             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
04195             "psubw %%mm0, %%mm4          \n\t"
04196             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
04197             "psubw %%mm0, %%mm4          \n\t"
04198             "psubw %%mm7, %%mm5          \n\t"
04199             "pxor %%mm0, %%mm0           \n\t"
04200             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
04201             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
04202             "psubw %%mm7, %%mm5          \n\t"
04203             "psubw %%mm0, %%mm6          \n\t"
04204             //  test pa <= pb
04205             "movq %%mm4, %%mm7           \n\t"
04206             "psubw %%mm0, %%mm6          \n\t"
04207             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
04208             "movq %%mm7, %%mm0           \n\t"
04209             // use mm7 mask to merge pa & pb
04210             "pand %%mm7, %%mm5           \n\t"
04211             // use mm0 mask copy to merge a & b
04212             "pand %%mm0, %%mm2           \n\t"
04213             "pandn %%mm4, %%mm7          \n\t"
04214             "pandn %%mm1, %%mm0          \n\t"
04215             "paddw %%mm5, %%mm7          \n\t"
04216             "paddw %%mm2, %%mm0          \n\t"
04217             //  test  ((pa <= pb)? pa:pb) <= pc
04218             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
04219             "pxor %%mm1, %%mm1           \n\t"
04220             "pand %%mm7, %%mm3           \n\t"
04221             "pandn %%mm0, %%mm7          \n\t"
04222             "pxor %%mm1, %%mm1           \n\t"
04223             "paddw %%mm3, %%mm7          \n\t"
04224             "pxor %%mm0, %%mm0           \n\t"
04225             // step ecx to next set of 8 bytes and repeat loop til done
04226             "addl $8, %%ecx              \n\t"
04227             "packuswb %%mm7, %%mm1       \n\t"
04228             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
04229             "cmpl _MMXLength, %%ecx      \n\t"
04230             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
04231                             // mm1 will be used as Raw(x-bpp) next loop
04232             "jb paeth_8lp                \n\t"
04233 
04234             : "=S" (dummy_value_S),             // output regs (dummy)
04235               "=D" (dummy_value_D)
04236 
04237             : "0" (prev_row),  // esi           // input regs
04238               "1" (row)        // edi
04239 
04240             : "%ecx"                            // clobber list
04241 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
04242             , "%mm0", "%mm1", "%mm2", "%mm3"
04243             , "%mm4", "%mm5", "%mm6", "%mm7"
04244 #endif
04245          );
04246       }
04247       break;  // end 8 bpp
04248 
04249       case 1:                // bpp = 1
04250       case 2:                // bpp = 2
04251       default:               // bpp > 8
04252       {
04253          __asm__ __volatile__ (
04254 #ifdef __PIC__
04255             "pushl %%ebx                 \n\t" // save Global Offset Table index
04256 #endif
04257             "movl _dif, %%ebx            \n\t"
04258             "cmpl _FullLength, %%ebx     \n\t"
04259             "jnb paeth_dend              \n\t"
04260 
04261 // preload  "movl row, %%edi             \n\t"
04262 // preload  "movl prev_row, %%esi        \n\t"
04263             // do Paeth decode for remaining bytes
04264             "movl %%ebx, %%edx           \n\t"
04265 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
04266             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
04267             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
04268 
04269          "paeth_dlp:                     \n\t"
04270             "xorl %%eax, %%eax           \n\t"
04271             // pav = p - a = (a + b - c) - a = b - c
04272             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
04273             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04274             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04275             "movl %%eax, _patemp         \n\t" // Save pav for later use
04276             "xorl %%eax, %%eax           \n\t"
04277             // pbv = p - b = (a + b - c) - b = a - c
04278             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
04279             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04280             "movl %%eax, %%ecx           \n\t"
04281             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04282             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
04283             // pc = abs(pcv)
04284             "testl $0x80000000, %%eax    \n\t"
04285             "jz paeth_dpca               \n\t"
04286             "negl %%eax                  \n\t" // reverse sign of neg values
04287 
04288          "paeth_dpca:                    \n\t"
04289             "movl %%eax, _pctemp         \n\t" // save pc for later use
04290             // pb = abs(pbv)
04291             "testl $0x80000000, %%ecx    \n\t"
04292             "jz paeth_dpba               \n\t"
04293             "negl %%ecx                  \n\t" // reverse sign of neg values
04294 
04295          "paeth_dpba:                    \n\t"
04296             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
04297             // pa = abs(pav)
04298             "movl _patemp, %%eax         \n\t"
04299             "testl $0x80000000, %%eax    \n\t"
04300             "jz paeth_dpaa               \n\t"
04301             "negl %%eax                  \n\t" // reverse sign of neg values
04302 
04303          "paeth_dpaa:                    \n\t"
04304             "movl %%eax, _patemp         \n\t" // save pa for later use
04305             // test if pa <= pb
04306             "cmpl %%ecx, %%eax           \n\t"
04307             "jna paeth_dabb              \n\t"
04308             // pa > pb; now test if pb <= pc
04309             "cmpl _pctemp, %%ecx         \n\t"
04310             "jna paeth_dbbc              \n\t"
04311             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04312             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04313             "jmp paeth_dpaeth            \n\t"
04314 
04315          "paeth_dbbc:                    \n\t"
04316             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
04317             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
04318             "jmp paeth_dpaeth            \n\t"
04319 
04320          "paeth_dabb:                    \n\t"
04321             // pa <= pb; now test if pa <= pc
04322             "cmpl _pctemp, %%eax         \n\t"
04323             "jna paeth_dabc              \n\t"
04324             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04325             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04326             "jmp paeth_dpaeth            \n\t"
04327 
04328          "paeth_dabc:                    \n\t"
04329             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
04330             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
04331 
04332          "paeth_dpaeth:                  \n\t"
04333             "incl %%ebx                  \n\t"
04334             "incl %%edx                  \n\t"
04335             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
04336             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
04337             "cmpl _FullLength, %%ebx     \n\t"
04338             "jb paeth_dlp                \n\t"
04339 
04340          "paeth_dend:                    \n\t"
04341 #ifdef __PIC__
04342             "popl %%ebx                  \n\t" // index to Global Offset Table
04343 #endif
04344 
04345             : "=c" (dummy_value_c),            // output regs (dummy)
04346               "=S" (dummy_value_S),
04347               "=D" (dummy_value_D)
04348 
04349             : "0" (bpp),       // ecx          // input regs
04350               "1" (prev_row),  // esi
04351               "2" (row)        // edi
04352 
04353             : "%eax", "%edx"                   // clobber list
04354 #ifndef __PIC__
04355             , "%ebx"
04356 #endif
04357          );
04358       }
04359       return;                   // No need to go further with this one
04360 
04361    } // end switch (bpp)
04362 
04363    __asm__ __volatile__ (
04364       // MMX acceleration complete; now do clean-up
04365       // check if any remaining bytes left to decode
04366 #ifdef __PIC__
04367       "pushl %%ebx                 \n\t" // save index to Global Offset Table
04368 #endif
04369       "movl _MMXLength, %%ebx      \n\t"
04370       "cmpl _FullLength, %%ebx     \n\t"
04371       "jnb paeth_end               \n\t"
04372 //pre "movl row, %%edi             \n\t"
04373 //pre "movl prev_row, %%esi        \n\t"
04374       // do Paeth decode for remaining bytes
04375       "movl %%ebx, %%edx           \n\t"
04376 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
04377       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
04378       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
04379 
04380    "paeth_lp2:                     \n\t"
04381       "xorl %%eax, %%eax           \n\t"
04382       // pav = p - a = (a + b - c) - a = b - c
04383       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
04384       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04385       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04386       "movl %%eax, _patemp         \n\t" // Save pav for later use
04387       "xorl %%eax, %%eax           \n\t"
04388       // pbv = p - b = (a + b - c) - b = a - c
04389       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
04390       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
04391       "movl %%eax, %%ecx           \n\t"
04392       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
04393       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
04394       // pc = abs(pcv)
04395       "testl $0x80000000, %%eax    \n\t"
04396       "jz paeth_pca2               \n\t"
04397       "negl %%eax                  \n\t" // reverse sign of neg values
04398 
04399    "paeth_pca2:                    \n\t"
04400       "movl %%eax, _pctemp         \n\t" // save pc for later use
04401       // pb = abs(pbv)
04402       "testl $0x80000000, %%ecx    \n\t"
04403       "jz paeth_pba2               \n\t"
04404       "negl %%ecx                  \n\t" // reverse sign of neg values
04405 
04406    "paeth_pba2:                    \n\t"
04407       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
04408       // pa = abs(pav)
04409       "movl _patemp, %%eax         \n\t"
04410       "testl $0x80000000, %%eax    \n\t"
04411       "jz paeth_paa2               \n\t"
04412       "negl %%eax                  \n\t" // reverse sign of neg values
04413 
04414    "paeth_paa2:                    \n\t"
04415       "movl %%eax, _patemp         \n\t" // save pa for later use
04416       // test if pa <= pb
04417       "cmpl %%ecx, %%eax           \n\t"
04418       "jna paeth_abb2              \n\t"
04419       // pa > pb; now test if pb <= pc
04420       "cmpl _pctemp, %%ecx         \n\t"
04421       "jna paeth_bbc2              \n\t"
04422       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04423       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04424       "jmp paeth_paeth2            \n\t"
04425 
04426    "paeth_bbc2:                    \n\t"
04427       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
04428       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
04429       "jmp paeth_paeth2            \n\t"
04430 
04431    "paeth_abb2:                    \n\t"
04432       // pa <= pb; now test if pa <= pc
04433       "cmpl _pctemp, %%eax         \n\t"
04434       "jna paeth_abc2              \n\t"
04435       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
04436       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
04437       "jmp paeth_paeth2            \n\t"
04438 
04439    "paeth_abc2:                    \n\t"
04440       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
04441       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
04442 
04443    "paeth_paeth2:                  \n\t"
04444       "incl %%ebx                  \n\t"
04445       "incl %%edx                  \n\t"
04446       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
04447       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
04448       "cmpl _FullLength, %%ebx     \n\t"
04449       "jb paeth_lp2                \n\t"
04450 
04451    "paeth_end:                     \n\t"
04452       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
04453 #ifdef __PIC__
04454       "popl %%ebx                  \n\t" // restore index to Global Offset Table
04455 #endif
04456 
04457       : "=c" (dummy_value_c),            // output regs (dummy)
04458         "=S" (dummy_value_S),
04459         "=D" (dummy_value_D)
04460 
04461       : "0" (bpp),       // ecx          // input regs
04462         "1" (prev_row),  // esi
04463         "2" (row)        // edi
04464 
04465       : "%eax", "%edx"                   // clobber list (no input regs!)
04466 #ifndef __PIC__
04467       , "%ebx"
04468 #endif
04469    );
04470 
04471 } /* end png_read_filter_row_mmx_paeth() */
04472 #endif
04473 
04474 
04475 
04476 
04477 #ifdef PNG_THREAD_UNSAFE_OK
04478 //===========================================================================//
04479 //                                                                           //
04480 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
04481 //                                                                           //
04482 //===========================================================================//
04483 
04484 // Optimized code for PNG Sub filter decoder
04485 
04486 static void /* PRIVATE */
04487 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
04488 {
04489    int bpp;
04490    int dummy_value_a;
04491    int dummy_value_D;
04492 
04493    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
04494    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
04495 
04496    __asm__ __volatile__ (
04497 //pre "movl row, %%edi             \n\t"
04498       "movl %%edi, %%esi           \n\t" // lp = row
04499 //pre "movl bpp, %%eax             \n\t"
04500       "addl %%eax, %%edi           \n\t" // rp = row + bpp
04501 //irr "xorl %%eax, %%eax           \n\t"
04502       // get # of bytes to alignment
04503       "movl %%edi, _dif            \n\t" // take start of row
04504       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
04505                                          //  alignment boundary
04506       "xorl %%ecx, %%ecx           \n\t"
04507       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
04508       "subl %%edi, _dif            \n\t" // subtract from start ==> value
04509       "jz sub_go                   \n\t" //  ecx at alignment
04510 
04511    "sub_lp1:                       \n\t" // fix alignment
04512       "movb (%%esi,%%ecx,), %%al   \n\t"
04513       "addb %%al, (%%edi,%%ecx,)   \n\t"
04514       "incl %%ecx                  \n\t"
04515       "cmpl _dif, %%ecx            \n\t"
04516       "jb sub_lp1                  \n\t"
04517 
04518    "sub_go:                        \n\t"
04519       "movl _FullLength, %%eax     \n\t"
04520       "movl %%eax, %%edx           \n\t"
04521       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
04522       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
04523       "subl %%edx, %%eax           \n\t" // drop over bytes from length
04524       "movl %%eax, _MMXLength      \n\t"
04525 
04526       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04527         "=D" (dummy_value_D)    // 1
04528 
04529       : "0" (bpp),              // eax    // input regs
04530         "1" (row)               // edi
04531 
04532       : "%ebx", "%ecx", "%edx"            // clobber list
04533       , "%esi"
04534 
04535 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04536       , "%mm0", "%mm1", "%mm2", "%mm3"
04537       , "%mm4", "%mm5", "%mm6", "%mm7"
04538 #endif
04539    );
04540 
04541    // now do the math for the rest of the row
04542    switch (bpp)
04543    {
04544       case 3:
04545       {
04546          _ActiveMask.use  = 0x0000ffffff000000LL;
04547          _ShiftBpp.use = 24;       // == 3 * 8
04548          _ShiftRem.use  = 40;      // == 64 - 24
04549 
04550          __asm__ __volatile__ (
04551 // preload  "movl row, %%edi              \n\t"
04552             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
04553                                                 //  active byte group
04554             "movl %%edi, %%esi            \n\t" // lp = row
04555 // preload  "movl bpp, %%eax              \n\t"
04556             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04557             "movq %%mm7, %%mm6            \n\t"
04558             "movl _dif, %%edx             \n\t"
04559             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
04560                                                 //  3rd active byte group
04561             // prime the pump:  load the first Raw(x-bpp) data set
04562             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04563 
04564          "sub_3lp:                        \n\t" // shift data for adding first
04565             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04566                                                 //  shift clears inactive bytes)
04567             // add 1st active group
04568             "movq (%%edi,%%edx,), %%mm0   \n\t"
04569             "paddb %%mm1, %%mm0           \n\t"
04570 
04571             // add 2nd active group
04572             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04573             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04574             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
04575             "paddb %%mm1, %%mm0           \n\t"
04576 
04577             // add 3rd active group
04578             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04579             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04580             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
04581             "addl $8, %%edx               \n\t"
04582             "paddb %%mm1, %%mm0           \n\t"
04583 
04584             "cmpl _MMXLength, %%edx       \n\t"
04585             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
04586             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04587             "jb sub_3lp                   \n\t"
04588 
04589             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04590               "=D" (dummy_value_D)    // 1
04591 
04592             : "0" (bpp),              // eax    // input regs
04593               "1" (row)               // edi
04594 
04595             : "%edx", "%esi"                    // clobber list
04596 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04597             , "%mm0", "%mm1", "%mm6", "%mm7"
04598 #endif
04599          );
04600       }
04601       break;
04602 
04603       case 1:
04604       {
04605          __asm__ __volatile__ (
04606             "movl _dif, %%edx            \n\t"
04607 // preload  "movl row, %%edi             \n\t"
04608             "cmpl _FullLength, %%edx     \n\t"
04609             "jnb sub_1end                \n\t"
04610             "movl %%edi, %%esi           \n\t" // lp = row
04611             "xorl %%eax, %%eax           \n\t"
04612 // preload  "movl bpp, %%eax             \n\t"
04613             "addl %%eax, %%edi           \n\t" // rp = row + bpp
04614 
04615          "sub_1lp:                       \n\t"
04616             "movb (%%esi,%%edx,), %%al   \n\t"
04617             "addb %%al, (%%edi,%%edx,)   \n\t"
04618             "incl %%edx                  \n\t"
04619             "cmpl _FullLength, %%edx     \n\t"
04620             "jb sub_1lp                  \n\t"
04621 
04622          "sub_1end:                      \n\t"
04623 
04624             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04625               "=D" (dummy_value_D)    // 1
04626 
04627             : "0" (bpp),              // eax    // input regs
04628               "1" (row)               // edi
04629 
04630             : "%edx", "%esi"                    // clobber list
04631          );
04632       }
04633       return;
04634 
04635       case 6:
04636       case 4:
04637       //case 7:   // GRR BOGUS
04638       //case 5:   // GRR BOGUS
04639       {
04640          _ShiftBpp.use = bpp << 3;
04641          _ShiftRem.use = 64 - _ShiftBpp.use;
04642 
04643          __asm__ __volatile__ (
04644 // preload  "movl row, %%edi              \n\t"
04645             "movl _dif, %%edx             \n\t"
04646             "movl %%edi, %%esi            \n\t" // lp = row
04647 // preload  "movl bpp, %%eax              \n\t"
04648             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04649 
04650             // prime the pump:  load the first Raw(x-bpp) data set
04651             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04652 
04653          "sub_4lp:                        \n\t" // shift data for adding first
04654             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04655                                                 //  shift clears inactive bytes)
04656             "movq (%%edi,%%edx,), %%mm0   \n\t"
04657             "paddb %%mm1, %%mm0           \n\t"
04658 
04659             // add 2nd active group
04660             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04661             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04662             "addl $8, %%edx               \n\t"
04663             "paddb %%mm1, %%mm0           \n\t"
04664 
04665             "cmpl _MMXLength, %%edx       \n\t"
04666             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
04667             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04668             "jb sub_4lp                   \n\t"
04669 
04670             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04671               "=D" (dummy_value_D)    // 1
04672 
04673             : "0" (bpp),              // eax    // input regs
04674               "1" (row)               // edi
04675 
04676             : "%edx", "%esi"                    // clobber list
04677 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04678             , "%mm0", "%mm1"
04679 #endif
04680          );
04681       }
04682       break;
04683 
04684       case 2:
04685       {
04686          _ActiveMask.use = 0x00000000ffff0000LL;
04687          _ShiftBpp.use = 16;       // == 2 * 8
04688          _ShiftRem.use = 48;       // == 64 - 16
04689 
04690          __asm__ __volatile__ (
04691             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
04692                                                 //  active byte group
04693             "movl _dif, %%edx             \n\t"
04694             "movq %%mm7, %%mm6            \n\t"
04695 // preload  "movl row, %%edi              \n\t"
04696             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
04697                                                 //  3rd active byte group
04698             "movl %%edi, %%esi            \n\t" // lp = row
04699             "movq %%mm6, %%mm5            \n\t"
04700 // preload  "movl bpp, %%eax              \n\t"
04701             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04702             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
04703                                                 //  4th active byte group
04704             // prime the pump:  load the first Raw(x-bpp) data set
04705             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
04706 
04707          "sub_2lp:                        \n\t" // shift data for adding first
04708             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
04709                                                 //  shift clears inactive bytes)
04710             // add 1st active group
04711             "movq (%%edi,%%edx,), %%mm0   \n\t"
04712             "paddb %%mm1, %%mm0           \n\t"
04713 
04714             // add 2nd active group
04715             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04716             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04717             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
04718             "paddb %%mm1, %%mm0           \n\t"
04719 
04720             // add 3rd active group
04721             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04722             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04723             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
04724             "paddb %%mm1, %%mm0           \n\t"
04725 
04726             // add 4th active group
04727             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
04728             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
04729             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
04730             "addl $8, %%edx               \n\t"
04731             "paddb %%mm1, %%mm0           \n\t"
04732             "cmpl _MMXLength, %%edx       \n\t"
04733             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
04734             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
04735             "jb sub_2lp                   \n\t"
04736 
04737             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04738               "=D" (dummy_value_D)    // 1
04739 
04740             : "0" (bpp),              // eax    // input regs
04741               "1" (row)               // edi
04742 
04743             : "%edx", "%esi"                    // clobber list
04744 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04745             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
04746 #endif
04747          );
04748       }
04749       break;
04750 
04751       case 8:
04752       {
04753          __asm__ __volatile__ (
04754 // preload  "movl row, %%edi              \n\t"
04755             "movl _dif, %%edx             \n\t"
04756             "movl %%edi, %%esi            \n\t" // lp = row
04757 // preload  "movl bpp, %%eax              \n\t"
04758             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04759             "movl _MMXLength, %%ecx       \n\t"
04760 
04761             // prime the pump:  load the first Raw(x-bpp) data set
04762             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
04763             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
04764 
04765          "sub_8lp:                        \n\t"
04766             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
04767             "paddb %%mm7, %%mm0           \n\t"
04768             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
04769             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
04770 
04771             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
04772             // This will be repeated for each group of 8 bytes with the 8th
04773             // group being used as the Raw(x-bpp) for the 1st group of the
04774             // next loop.
04775 
04776             "paddb %%mm0, %%mm1           \n\t"
04777             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
04778             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
04779             "paddb %%mm1, %%mm2           \n\t"
04780             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
04781             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
04782             "paddb %%mm2, %%mm3           \n\t"
04783             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
04784             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
04785             "paddb %%mm3, %%mm4           \n\t"
04786             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
04787             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
04788             "paddb %%mm4, %%mm5           \n\t"
04789             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
04790             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
04791             "paddb %%mm5, %%mm6           \n\t"
04792             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
04793             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
04794             "addl $64, %%edx              \n\t"
04795             "paddb %%mm6, %%mm7           \n\t"
04796             "cmpl %%ecx, %%edx            \n\t"
04797             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
04798             "jb sub_8lp                   \n\t"
04799 
04800             "cmpl _MMXLength, %%edx       \n\t"
04801             "jnb sub_8lt8                 \n\t"
04802 
04803          "sub_8lpA:                       \n\t"
04804             "movq (%%edi,%%edx,), %%mm0   \n\t"
04805             "addl $8, %%edx               \n\t"
04806             "paddb %%mm7, %%mm0           \n\t"
04807             "cmpl _MMXLength, %%edx       \n\t"
04808             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
04809             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
04810                                                 //  to mm1 to be new Raw(x-bpp)
04811                                                 //  for next loop
04812             "jb sub_8lpA                  \n\t"
04813 
04814          "sub_8lt8:                       \n\t"
04815 
04816             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04817               "=D" (dummy_value_D)    // 1
04818 
04819             : "0" (bpp),              // eax    // input regs
04820               "1" (row)               // edi
04821 
04822             : "%ecx", "%edx", "%esi"            // clobber list
04823 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04824             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
04825 #endif
04826          );
04827       }
04828       break;
04829 
04830       default:                // bpp greater than 8 bytes   GRR BOGUS
04831       {
04832          __asm__ __volatile__ (
04833             "movl _dif, %%edx             \n\t"
04834 // preload  "movl row, %%edi              \n\t"
04835             "movl %%edi, %%esi            \n\t" // lp = row
04836 // preload  "movl bpp, %%eax              \n\t"
04837             "addl %%eax, %%edi            \n\t" // rp = row + bpp
04838 
04839          "sub_Alp:                        \n\t"
04840             "movq (%%edi,%%edx,), %%mm0   \n\t"
04841             "movq (%%esi,%%edx,), %%mm1   \n\t"
04842             "addl $8, %%edx               \n\t"
04843             "paddb %%mm1, %%mm0           \n\t"
04844             "cmpl _MMXLength, %%edx       \n\t"
04845             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
04846                                                 //  -8 to offset addl edx
04847             "jb sub_Alp                   \n\t"
04848 
04849             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04850               "=D" (dummy_value_D)    // 1
04851 
04852             : "0" (bpp),              // eax    // input regs
04853               "1" (row)               // edi
04854 
04855             : "%edx", "%esi"                    // clobber list
04856 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
04857             , "%mm0", "%mm1"
04858 #endif
04859          );
04860       }
04861       break;
04862 
04863    } // end switch (bpp)
04864 
04865    __asm__ __volatile__ (
04866       "movl _MMXLength, %%edx       \n\t"
04867 //pre "movl row, %%edi              \n\t"
04868       "cmpl _FullLength, %%edx      \n\t"
04869       "jnb sub_end                  \n\t"
04870 
04871       "movl %%edi, %%esi            \n\t" // lp = row
04872 //pre "movl bpp, %%eax              \n\t"
04873       "addl %%eax, %%edi            \n\t" // rp = row + bpp
04874       "xorl %%eax, %%eax            \n\t"
04875 
04876    "sub_lp2:                        \n\t"
04877       "movb (%%esi,%%edx,), %%al    \n\t"
04878       "addb %%al, (%%edi,%%edx,)    \n\t"
04879       "incl %%edx                   \n\t"
04880       "cmpl _FullLength, %%edx      \n\t"
04881       "jb sub_lp2                   \n\t"
04882 
04883    "sub_end:                        \n\t"
04884       "EMMS                         \n\t" // end MMX instructions
04885 
04886       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
04887         "=D" (dummy_value_D)    // 1
04888 
04889       : "0" (bpp),              // eax    // input regs
04890         "1" (row)               // edi
04891 
04892       : "%edx", "%esi"                    // clobber list
04893    );
04894 
04895 } // end of png_read_filter_row_mmx_sub()
04896 #endif
04897 
04898 
04899 
04900 
04901 //===========================================================================//
04902 //                                                                           //
04903 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
04904 //                                                                           //
04905 //===========================================================================//
04906 
04907 // Optimized code for PNG Up filter decoder
      //
      // Reverses the PNG "Up" filter in place:  for every offset x in
      // [0, rowbytes),
      //    row[x] = (row[x] + prev_row[x]) mod 256
      //
      //   row_info - supplies rowbytes
      //   row      - filtered row; overwritten with the reconstructed bytes
      //   prev_row - the row above; only read
      //
      // One asm statement does all the work, with edx = len, esi = prev_row and
      // edi = row preloaded through the constraints and ebx used as the running
      // byte index:
      //   up_lp1  - byte loop until row + ebx is 8-byte aligned
      //   up_loop - 64 bytes per pass using all eight MMX registers
      //   up_lpA  - 8 bytes per pass for the remaining whole quadwords
      //   up_lp2  - byte loop for the final (< 8) bytes
      //
      // NOTE(review): up_loop tests its bound only after the first pass, so it
      // always processes 64 bytes once; rows with fewer than 64 bytes left
      // after alignment would be overrun.  Callers presumably guarantee a
      // minimum rowbytes before dispatching here -- confirm.
04908 
04909 static void /* PRIVATE */
04910 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
04911                            png_bytep prev_row)
04912 {
04913    png_uint_32 len;
04914    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
04915    int dummy_value_S;
04916    int dummy_value_D;
04917 
04918    len = row_info->rowbytes;              // number of bytes to filter
04919 
04920    __asm__ __volatile__ (
04921 //pre "movl row, %%edi              \n\t"
04922       // get # of bytes to alignment
04923 #ifdef __PIC__
04924       "pushl %%ebx                  \n\t"
04925 #endif
04926       "movl %%edi, %%ecx            \n\t"
04927       "xorl %%ebx, %%ebx            \n\t"
04928       "addl $0x7, %%ecx             \n\t"
04929       "xorl %%eax, %%eax            \n\t"
04930       "andl $0xfffffff8, %%ecx      \n\t"
04931 //pre "movl prev_row, %%esi         \n\t"
04932       "subl %%edi, %%ecx            \n\t"
04933       "jz up_go                     \n\t"
04934 
04935    "up_lp1:                         \n\t" // fix alignment
04936       "movb (%%edi,%%ebx,), %%al    \n\t"
04937       "addb (%%esi,%%ebx,), %%al    \n\t"
04938       "incl %%ebx                   \n\t"
04939       "cmpl %%ecx, %%ebx            \n\t"
04940       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
04941       "jb up_lp1                    \n\t" //  offset incl ebx
04942 
04943    "up_go:                          \n\t"
04944 //pre "movl len, %%edx              \n\t"
04945       "movl %%edx, %%ecx            \n\t"
04946       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
04947       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
04948       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
04949 
04950       // unrolled loop - use all MMX registers and interleave to reduce
04951       // number of branch instructions (loops) and reduce partial stalls
04952    "up_loop:                        \n\t"
04953       "movq (%%esi,%%ebx,), %%mm1   \n\t"
04954       "movq (%%edi,%%ebx,), %%mm0   \n\t"
04955       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
04956       "paddb %%mm1, %%mm0           \n\t"
04957       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
04958       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
04959       "paddb %%mm3, %%mm2           \n\t"
04960       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
04961       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
04962       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
04963       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
04964       "paddb %%mm5, %%mm4           \n\t"
04965       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
04966       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
04967       "paddb %%mm7, %%mm6           \n\t"
04968       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
04969       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
04970       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
04971       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
04972       "paddb %%mm1, %%mm0           \n\t"
04973       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
04974       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
04975       "paddb %%mm3, %%mm2           \n\t"
04976       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
04977       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
04978       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
04979       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
04980       "paddb %%mm5, %%mm4           \n\t"
04981       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
04982       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
04983       "addl $64, %%ebx              \n\t"
04984       "paddb %%mm7, %%mm6           \n\t"
04985       "cmpl %%ecx, %%ebx            \n\t"
04986       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
04987       "jb up_loop                   \n\t" //  -8 to offset addl ebx
04988 
04989       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
04990       "jz up_end                    \n\t"
04991 
04992       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
04993       "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
04994 
04995       "addl %%edx, %%ecx            \n\t"
04996       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
04997       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
04998       "jz up_lt8                    \n\t"
04999 
05000    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
05001       "movq (%%esi,%%ebx,), %%mm1   \n\t"
05002       "movq (%%edi,%%ebx,), %%mm0   \n\t"
05003       "addl $8, %%ebx               \n\t"
05004       "paddb %%mm1, %%mm0           \n\t"
05005       "cmpl %%ecx, %%ebx            \n\t"
05006       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
05007       "jb up_lpA                    \n\t" //  offset add ebx
05008       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
05009       "jz up_end                    \n\t"
05010 
05011    "up_lt8:                         \n\t"
05012       "xorl %%eax, %%eax            \n\t"
05013       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
05014 
05015    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
05016       "movb (%%edi,%%ebx,), %%al    \n\t"
05017       "addb (%%esi,%%ebx,), %%al    \n\t"
05018       "incl %%ebx                   \n\t"
05019       "cmpl %%ecx, %%ebx            \n\t"
05020       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
05021       "jb up_lp2                    \n\t" //  offset inc ebx
05022 
05023    "up_end:                         \n\t"
05024       "EMMS                         \n\t" // conversion of filtered row complete
05025 #ifdef __PIC__
05026       "popl %%ebx                   \n\t"
05027 #endif
05028 
05029       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
05030         "=S" (dummy_value_S),   // 1
05031         "=D" (dummy_value_D)    // 2
05032 
05033       : "0" (len),              // edx    // input regs
05034         "1" (prev_row),         // esi
05035         "2" (row)               // edi
05036 
05037       : "%eax", "%ecx"                    // clobber list (no input regs!)
            // Under PIC, ebx is preserved by the pushl/popl pair above and must
            // not be named as clobbered (older GCC rejects that); otherwise the
            // compiler has to be told that ebx is overwritten.
      #ifndef __PIC__
            , "%ebx"
      #endif
05038 
05039 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
05040       , "%mm0", "%mm1", "%mm2", "%mm3"
05041       , "%mm4", "%mm5", "%mm6", "%mm7"
05042 #endif
05043    );
05044 
05045 } // end of png_read_filter_row_mmx_up()
05046 
05047 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05048 
05049 
05050 
05051 
05052 /*===========================================================================*/
05053 /*                                                                           */
05054 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
05055 /*                                                                           */
05056 /*===========================================================================*/
05057 
05058 
05059 /* Optimized png_read_filter_row routines */
05060 
05061 void /* PRIVATE */
05062 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
05063    row, png_bytep prev_row, int filter)
05064 {
05065 #ifdef PNG_DEBUG
05066    char filnm[10];
05067 #endif
05068 
05069 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
05070 /* GRR:  these are superseded by png_ptr->asm_flags: */
05071 #define UseMMX_sub    1   // GRR:  converted 20000730
05072 #define UseMMX_up     1   // GRR:  converted 20000729
05073 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
05074 #define UseMMX_paeth  1   // GRR:  converted 20000828
05075 
05076    if (_mmx_supported == 2) {
05077        /* this should have happened in png_init_mmx_flags() already */
05078 #if !defined(PNG_1_0_X)
05079        png_warning(png_ptr, "asm_flags may not have been initialized");
05080 #endif
05081        png_mmx_support();
05082    }
05083 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05084 
05085 #ifdef PNG_DEBUG
05086    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
05087    switch (filter)
05088    {
05089       case 0: sprintf(filnm, "none");
05090          break;
05091       case 1: sprintf(filnm, "sub-%s",
05092 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05093 #if !defined(PNG_1_0_X)
05094         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : 
05095 #endif
05096 #endif
05097 "x86");
05098          break;
05099       case 2: sprintf(filnm, "up-%s",
05100 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
05101 #if !defined(PNG_1_0_X)
05102         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
05103 #endif
05104 #endif
05105  "x86");
05106          break;
05107       case 3: sprintf(filnm, "avg-%s",
05108 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05109 #if !defined(PNG_1_0_X)
05110         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
05111 #endif
05112 #endif
05113  "x86");
05114          break;
05115       case 4: sprintf(filnm, "Paeth-%s",
05116 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05117 #if !defined(PNG_1_0_X)
05118         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
05119 #endif
05120 #endif
05121 "x86");
05122          break;
05123       default: sprintf(filnm, "unknw");
05124          break;
05125    }
05126    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
05127    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
05128    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
05129       (int)((row_info->pixel_depth + 7) >> 3));
05130    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
05131 #endif /* PNG_DEBUG */
05132 
05133    switch (filter)
05134    {
05135       case PNG_FILTER_VALUE_NONE:
05136          break;
05137 
05138       case PNG_FILTER_VALUE_SUB:
05139 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05140 #if !defined(PNG_1_0_X)
05141          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
05142              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05143              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05144 #else
05145          if (_mmx_supported)
05146 #endif
05147          {
05148             png_read_filter_row_mmx_sub(row_info, row);
05149          }
05150          else
05151 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05152          {
05153             png_uint_32 i;
05154             png_uint_32 istop = row_info->rowbytes;
05155             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05156             png_bytep rp = row + bpp;
05157             png_bytep lp = row;
05158 
05159             for (i = bpp; i < istop; i++)
05160             {
05161                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
05162                rp++;
05163             }
05164          }  /* end !UseMMX_sub */
05165          break;
05166 
05167       case PNG_FILTER_VALUE_UP:
05168 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
05169 #if !defined(PNG_1_0_X)
05170          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
05171              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05172              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05173 #else
05174          if (_mmx_supported)
05175 #endif
05176          {
05177             png_read_filter_row_mmx_up(row_info, row, prev_row);
05178          }
05179           else
05180 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05181          {
05182             png_uint_32 i;
05183             png_uint_32 istop = row_info->rowbytes;
05184             png_bytep rp = row;
05185             png_bytep pp = prev_row;
05186 
05187             for (i = 0; i < istop; ++i)
05188             {
05189                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
05190                rp++;
05191             }
05192          }  /* end !UseMMX_up */
05193          break;
05194 
05195       case PNG_FILTER_VALUE_AVG:
05196 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05197 #if !defined(PNG_1_0_X)
05198          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
05199              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05200              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05201 #else
05202          if (_mmx_supported)
05203 #endif
05204          {
05205             png_read_filter_row_mmx_avg(row_info, row, prev_row);
05206          }
05207          else
05208 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05209          {
05210             png_uint_32 i;
05211             png_bytep rp = row;
05212             png_bytep pp = prev_row;
05213             png_bytep lp = row;
05214             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05215             png_uint_32 istop = row_info->rowbytes - bpp;
05216 
05217             for (i = 0; i < bpp; i++)
05218             {
05219                *rp = (png_byte)(((int)(*rp) +
05220                   ((int)(*pp++) >> 1)) & 0xff);
05221                rp++;
05222             }
05223 
05224             for (i = 0; i < istop; i++)
05225             {
05226                *rp = (png_byte)(((int)(*rp) +
05227                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
05228                rp++;
05229             }
05230          }  /* end !UseMMX_avg */
05231          break;
05232 
05233       case PNG_FILTER_VALUE_PAETH:
05234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
05235 #if !defined(PNG_1_0_X)
05236          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
05237              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
05238              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
05239 #else
05240          if (_mmx_supported)
05241 #endif
05242          {
05243             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
05244          }
05245          else
05246 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
05247          {
05248             png_uint_32 i;
05249             png_bytep rp = row;
05250             png_bytep pp = prev_row;
05251             png_bytep lp = row;
05252             png_bytep cp = prev_row;
05253             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
05254             png_uint_32 istop = row_info->rowbytes - bpp;
05255 
05256             for (i = 0; i < bpp; i++)
05257             {
05258                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
05259                rp++;
05260             }
05261 
05262             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
05263             {
05264                int a, b, c, pa, pb, pc, p;
05265 
05266                a = *lp++;
05267                b = *pp++;
05268                c = *cp++;
05269 
05270                p = b - c;
05271                pc = a - c;
05272 
05273 #ifdef PNG_USE_ABS
05274                pa = abs(p);
05275                pb = abs(pc);
05276                pc = abs(p + pc);
05277 #else
05278                pa = p < 0 ? -p : p;
05279                pb = pc < 0 ? -pc : pc;
05280                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
05281 #endif
05282 
05283                /*
05284                   if (pa <= pb && pa <= pc)
05285                      p = a;
05286                   else if (pb <= pc)
05287                      p = b;
05288                   else
05289                      p = c;
05290                 */
05291 
05292                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
05293 
05294                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
05295                rp++;
05296             }
05297          }  /* end !UseMMX_paeth */
05298          break;
05299 
05300       default:
05301          png_warning(png_ptr, "Ignoring bad row-filter type");
05302          *row=0;
05303          break;
05304    }
05305 }
05306 
05307 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
05308 
05309 
05310 /*===========================================================================*/
05311 /*                                                                           */
05312 /*                      P N G _ M M X _ S U P P O R T                        */
05313 /*                                                                           */
05314 /*===========================================================================*/
05315 
05316 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
05317  *             (2) all instructions compile with gcc 2.7.2.3 and later
05318  *             (3) the function is moved down here to prevent gcc from
05319  *                  inlining it in multiple places and then barfing be-
05320  *                  cause the ".NOT_SUPPORTED" label is multiply defined
05321  *             [is there a way to signal that a *single* function should
05322  *              not be inlined?  is there a way to modify the label for
05323  *              each inlined instance, e.g., by appending _1, _2, etc.?
05324  *              maybe if don't use leading "." in label name? (nope...sigh)]
05325  */
05326 
05327 int PNGAPI
05328 png_mmx_support(void)
05329 {
05330 #if defined(PNG_MMX_CODE_SUPPORTED)
05331     __asm__ __volatile__ (
05332         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
05333         "pushl %%ecx          \n\t"  // so does ecx...
05334         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
05335 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
05336 //      "pushf                \n\t"  // 16-bit pushf
05337         "pushfl               \n\t"  // save Eflag to stack
05338         "popl %%eax           \n\t"  // get Eflag from stack into eax
05339         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
05340         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
05341         "pushl %%eax          \n\t"  // save modified Eflag back to stack
05342 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
05343 //      "popf                 \n\t"  // 16-bit popf
05344         "popfl                \n\t"  // restore modified value to Eflag reg
05345         "pushfl               \n\t"  // save Eflag to stack
05346         "popl %%eax           \n\t"  // get Eflag from stack
05347         "pushl %%ecx          \n\t"  // save original Eflag to stack
05348         "popfl                \n\t"  // restore original Eflag
05349         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
05350         "jz 0f                \n\t"  // if same, CPUID instr. is not supported
05351 
05352         "xorl %%eax, %%eax    \n\t"  // set eax to zero
05353 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
05354         "cpuid                \n\t"  // get the CPU identification info
05355         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
05356         "jl 0f                \n\t"  // if eax is zero, MMX is not supported
05357 
05358         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
05359         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
05360                                      // faster than the instruction "mov eax, 1"
05361         "cpuid                \n\t"  // get the CPU identification info again
05362         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
05363         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
05364         "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
05365 
05366         "movl $1, %%eax       \n\t"  // set return value to 1
05367         "jmp  1f              \n\t"  // DONE:  have MMX support
05368 
05369     "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
05370         "movl $0, %%eax       \n\t"  // set return value to 0
05371     "1:                       \n\t"  // .RETURN: target label for jump instructions
05372         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
05373         "popl %%edx           \n\t"  // restore edx
05374         "popl %%ecx           \n\t"  // restore ecx
05375         "popl %%ebx           \n\t"  // restore ebx
05376 
05377 //      "ret                  \n\t"  // DONE:  no MMX support
05378                                      // (fall through to standard C "ret")
05379 
05380         :                            // output list (none)
05381 
05382         :                            // any variables used on input (none)
05383 
05384         : "%eax"                     // clobber list
05385 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
05386 //      , "memory"   // if write to a variable gcc thought was in a reg
05387 //      , "cc"       // "condition codes" (flag bits)
05388     );
05389 #else     
05390     _mmx_supported = 0;
05391 #endif /* PNG_MMX_CODE_SUPPORTED */
05392 
05393     return _mmx_supported;
05394 }
05395 
05396 
05397 #endif /* PNG_USE_PNGGCCRD */