Back to index

plt-scheme  4.2.1
pngvcrd.c
Go to the documentation of this file.
00001 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
00002  *
00003  * For Intel x86 CPU and Microsoft Visual C++ compiler
00004  *
00005  * libpng version 1.2.5 - October 3, 2002
00006  * For conditions of distribution and use, see copyright notice in png.h
00007  * Copyright (c) 1998-2002 Glenn Randers-Pehrson
00008  * Copyright (c) 1998, Intel Corporation
00009  *
00010  * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
00011  * Interface to libpng contributed by Gilles Vollant, 1999
00012  *
00013  *
00014  * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
00015  * a sign error in the post-MMX cleanup code for each pixel_depth resulted
00016  * in bad pixels at the beginning of some rows of some images, and also
00017  * (due to out-of-range memory reads and writes) caused heap corruption
00018  * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
00019  *
00020  * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
00021  *
00022  * [runtime MMX configuration, GRR 20010102]
00023  *
00024  */
00025 
00026 #define PNG_INTERNAL
00027 #include "png.h"
00028 
00029 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
00030 
/* Cached result of the MMX probe for this translation unit.
 *   2 = not probed yet; png_combine_row() treats this value as "asm_flags may
 *       not have been initialized", emits a warning and calls
 *       png_mmx_support() itself.
 *   0 or 1 = value stored by png_mmx_support() (0 = no MMX, 1 = MMX present).
 */
00031 static int mmx_supported=2;
00032 
00033 
/* png_mmx_support - probe the CPU for MMX using the CPUID instruction.
 *
 * Steps performed by the inline assembly below:
 *   1. Try to toggle the ID bit (bit 21, mask 0x200000) of EFLAGS.  If the bit
 *      cannot be changed the code treats CPUID as unavailable and reports
 *      "no MMX".  The original EFLAGS value is restored either way.
 *   2. Execute CPUID with eax = 0 and require eax >= 1 on return before
 *      going on to the eax = 1 query.
 *   3. Execute CPUID with eax = 1 and test edx against mask 0x00800000
 *      (bit 23, counting from 0); a set bit is taken to mean MMX is present.
 *
 * CPUID is emitted as the raw opcode bytes 0x0f 0xa2 rather than by mnemonic
 * (presumably because the assembler of the time did not accept "cpuid" --
 * TODO confirm).
 *
 * Returns 1 if MMX was detected, 0 otherwise.  As a side effect the same
 * value is stored in the file-scope variable mmx_supported.
 */
00034 int PNGAPI
00035 png_mmx_support(void)
00036 {
00037   int mmx_supported_local = 0;
00038   _asm {
00039     push ebx          //save: CPUID overwrites ebx, ecx and edx
00040     push ecx
00041     push edx
00042
00043     pushfd            //Save Eflag to stack
00044     pop eax           //Get Eflag from stack into eax
00045     mov ecx, eax      //Keep an unmodified copy of Eflag in ecx
00046     xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
00047     push eax          //Save modified Eflag back to stack
00048
00049     popfd             //Load the modified value into the Eflag reg
00050     pushfd            //Save Eflag to stack
00051     pop eax           //Read back what the CPU actually kept
00052     push ecx          // save original Eflag to stack
00053     popfd             // restore original Eflag
00054     xor eax, ecx      //Compare the new Eflag with the original Eflag
00055     jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
00056                       //skip following instructions and jump to
00057                       //NOT_SUPPORTED label
00058
00059     xor eax, eax      //Set eax to zero (CPUID function 0)
00060
00061     _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
00062     _asm _emit 0xa2
00063
00064     cmp eax, 1        //the eax = 1 query below is only tried if eax >= 1
00065     jl NOT_SUPPORTED  //eax < 1 (signed compare): skip it, report no MMX
00066
00067     xor eax, eax      //set eax to zero
00068     inc eax           //Now increment eax to 1.  This instruction is
00069                       //faster than the instruction "mov eax, 1"
00070
00071     _asm _emit 0x0f   //CPUID instruction (function 1)
00072     _asm _emit 0xa2
00073
00074     and edx, 0x00800000  //keep only the MMX flag: bit 23, counting from 0
00075     cmp edx, 0        // 0 = mmx not supported
00076     jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
00077
00078     mov  mmx_supported_local, 1  //set return value to 1
00079
00080 NOT_SUPPORTED:
00081     mov  eax, mmx_supported_local  //move return value to eax (the C code
                                     //below still returns the variable itself)
00082     pop edx          //restore the registers saved on entry (reverse order)
00083     pop ecx
00084     pop ebx
00085   }
00086
00087   //mmx_supported_local=0; // test code for force don't support MMX
00088   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
00089
        /* cache the result so png_combine_row() no longer sees the value 2 */
00090   mmx_supported = mmx_supported_local;
00091   return mmx_supported_local;
00092 }
00093 
00094 /* Combines the row recently read in with the previous row.
00095    This routine takes care of alpha and transparency if requested.
00096    This routine also handles the two methods of progressive display
00097    of interlaced images, depending on the mask value.
00098    The mask value describes which pixels are to be combined with
00099    the row.  The pattern always repeats every 8 pixels, so just 8
00100    bits are needed.  A one indicates the pixel is to be combined; a
00101    zero indicates the pixel is to be skipped.  This is in addition
00102    to any alpha or transparency value associated with the pixel.  If
00103    you want all pixels to be combined, pass 0xff (255) in mask.  */
00104 
00105 /* Use this routine for x86 platform - uses faster MMX routine if machine
00106    supports MMX */
00107 
00108 void /* PRIVATE */
00109 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
00110 {
00111 #ifdef PNG_USE_LOCAL_ARRAYS
00112    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
00113 #endif
00114 
00115    png_debug(1,"in png_combine_row_asm\n");
00116 
00117    if (mmx_supported == 2) {
00118        /* this should have happened in png_init_mmx_flags() already */
00119        png_warning(png_ptr, "asm_flags may not have been initialized");
00120        png_mmx_support();
00121    }
00122 
00123    if (mask == 0xff)
00124    {
00125       png_memcpy(row, png_ptr->row_buf + 1,
00126        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
00127    }
00128    /* GRR:  add "else if (mask == 0)" case?
00129     *       or does png_combine_row() not even get called in that case? */
00130    else
00131    {
00132       switch (png_ptr->row_info.pixel_depth)
00133       {
00134          case 1:
00135          {
00136             png_bytep sp;
00137             png_bytep dp;
00138             int s_inc, s_start, s_end;
00139             int m;
00140             int shift;
00141             png_uint_32 i;
00142 
00143             sp = png_ptr->row_buf + 1;
00144             dp = row;
00145             m = 0x80;
00146 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00147             if (png_ptr->transformations & PNG_PACKSWAP)
00148             {
00149                 s_start = 0;
00150                 s_end = 7;
00151                 s_inc = 1;
00152             }
00153             else
00154 #endif
00155             {
00156                 s_start = 7;
00157                 s_end = 0;
00158                 s_inc = -1;
00159             }
00160 
00161             shift = s_start;
00162 
00163             for (i = 0; i < png_ptr->width; i++)
00164             {
00165                if (m & mask)
00166                {
00167                   int value;
00168 
00169                   value = (*sp >> shift) & 0x1;
00170                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
00171                   *dp |= (png_byte)(value << shift);
00172                }
00173 
00174                if (shift == s_end)
00175                {
00176                   shift = s_start;
00177                   sp++;
00178                   dp++;
00179                }
00180                else
00181                   shift += s_inc;
00182 
00183                if (m == 1)
00184                   m = 0x80;
00185                else
00186                   m >>= 1;
00187             }
00188             break;
00189          }
00190 
00191          case 2:
00192          {
00193             png_bytep sp;
00194             png_bytep dp;
00195             int s_start, s_end, s_inc;
00196             int m;
00197             int shift;
00198             png_uint_32 i;
00199             int value;
00200 
00201             sp = png_ptr->row_buf + 1;
00202             dp = row;
00203             m = 0x80;
00204 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00205             if (png_ptr->transformations & PNG_PACKSWAP)
00206             {
00207                s_start = 0;
00208                s_end = 6;
00209                s_inc = 2;
00210             }
00211             else
00212 #endif
00213             {
00214                s_start = 6;
00215                s_end = 0;
00216                s_inc = -2;
00217             }
00218 
00219             shift = s_start;
00220 
00221             for (i = 0; i < png_ptr->width; i++)
00222             {
00223                if (m & mask)
00224                {
00225                   value = (*sp >> shift) & 0x3;
00226                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
00227                   *dp |= (png_byte)(value << shift);
00228                }
00229 
00230                if (shift == s_end)
00231                {
00232                   shift = s_start;
00233                   sp++;
00234                   dp++;
00235                }
00236                else
00237                   shift += s_inc;
00238                if (m == 1)
00239                   m = 0x80;
00240                else
00241                   m >>= 1;
00242             }
00243             break;
00244          }
00245 
00246          case 4:
00247          {
00248             png_bytep sp;
00249             png_bytep dp;
00250             int s_start, s_end, s_inc;
00251             int m;
00252             int shift;
00253             png_uint_32 i;
00254             int value;
00255 
00256             sp = png_ptr->row_buf + 1;
00257             dp = row;
00258             m = 0x80;
00259 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
00260             if (png_ptr->transformations & PNG_PACKSWAP)
00261             {
00262                s_start = 0;
00263                s_end = 4;
00264                s_inc = 4;
00265             }
00266             else
00267 #endif
00268             {
00269                s_start = 4;
00270                s_end = 0;
00271                s_inc = -4;
00272             }
00273             shift = s_start;
00274 
00275             for (i = 0; i < png_ptr->width; i++)
00276             {
00277                if (m & mask)
00278                {
00279                   value = (*sp >> shift) & 0xf;
00280                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
00281                   *dp |= (png_byte)(value << shift);
00282                }
00283 
00284                if (shift == s_end)
00285                {
00286                   shift = s_start;
00287                   sp++;
00288                   dp++;
00289                }
00290                else
00291                   shift += s_inc;
00292                if (m == 1)
00293                   m = 0x80;
00294                else
00295                   m >>= 1;
00296             }
00297             break;
00298          }
00299 
00300          case 8:
00301          {
00302             png_bytep srcptr;
00303             png_bytep dstptr;
00304             png_uint_32 len;
00305             int m;
00306             int diff, unmask;
00307 
00308             __int64 mask0=0x0102040810204080;
00309 
00310             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00311                 /* && mmx_supported */ )
00312             {
00313                srcptr = png_ptr->row_buf + 1;
00314                dstptr = row;
00315                m = 0x80;
00316                unmask = ~mask;
00317                len  = png_ptr->width &~7;  //reduce to multiple of 8
00318                diff = png_ptr->width & 7;  //amount lost
00319 
00320                _asm
00321                {
00322                   movd       mm7, unmask   //load bit pattern
00323                   psubb      mm6,mm6       //zero mm6
00324                   punpcklbw  mm7,mm7
00325                   punpcklwd  mm7,mm7
00326                   punpckldq  mm7,mm7       //fill register with 8 masks
00327 
00328                   movq       mm0,mask0
00329 
00330                   pand       mm0,mm7       //nonzero if keep byte
00331                   pcmpeqb    mm0,mm6       //zeros->1s, v versa
00332 
00333                   mov        ecx,len       //load length of line (pixels)
00334                   mov        esi,srcptr    //load source
00335                   mov        ebx,dstptr    //load dest
00336                   cmp        ecx,0         //lcr
00337                   je         mainloop8end
00338 
00339 mainloop8:
00340                   movq       mm4,[esi]
00341                   pand       mm4,mm0
00342                   movq       mm6,mm0
00343                   pandn      mm6,[ebx]
00344                   por        mm4,mm6
00345                   movq       [ebx],mm4
00346 
00347                   add        esi,8         //inc by 8 bytes processed
00348                   add        ebx,8
00349                   sub        ecx,8         //dec by 8 pixels processed
00350 
00351                   ja         mainloop8
00352 mainloop8end:
00353 
00354                   mov        ecx,diff
00355                   cmp        ecx,0
00356                   jz         end8
00357 
00358                   mov        edx,mask
00359                   sal        edx,24        //make low byte the high byte
00360 
00361 secondloop8:
00362                   sal        edx,1         //move high bit to CF
00363                   jnc        skip8         //if CF = 0
00364                   mov        al,[esi]
00365                   mov        [ebx],al
00366 skip8:
00367                   inc        esi
00368                   inc        ebx
00369 
00370                   dec        ecx
00371                   jnz        secondloop8
00372 end8:
00373                   emms
00374                }
00375             }
00376             else /* mmx not supported - use modified C routine */
00377             {
00378                register unsigned int incr1, initial_val, final_val;
00379                png_size_t pixel_bytes;
00380                png_uint_32 i;
00381                register int disp = png_pass_inc[png_ptr->pass];
00382                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00383 
00384                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00385                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00386                   pixel_bytes;
00387                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00388                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00389                final_val = png_ptr->width*pixel_bytes;
00390                incr1 = (disp)*pixel_bytes;
00391                for (i = initial_val; i < final_val; i += incr1)
00392                {
00393                   png_memcpy(dstptr, srcptr, pixel_bytes);
00394                   srcptr += incr1;
00395                   dstptr += incr1;
00396                }
00397             } /* end of else */
00398 
00399             break;
00400          }       // end 8 bpp
00401 
00402          case 16:
00403          {
00404             png_bytep srcptr;
00405             png_bytep dstptr;
00406             png_uint_32 len;
00407             int unmask, diff;
00408             __int64 mask1=0x0101020204040808,
00409                     mask0=0x1010202040408080;
00410 
00411             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00412                 /* && mmx_supported */ )
00413             {
00414                srcptr = png_ptr->row_buf + 1;
00415                dstptr = row;
00416 
00417                unmask = ~mask;
00418                len     = (png_ptr->width)&~7;
00419                diff = (png_ptr->width)&7;
00420                _asm
00421                {
00422                   movd       mm7, unmask       //load bit pattern
00423                   psubb      mm6,mm6           //zero mm6
00424                   punpcklbw  mm7,mm7
00425                   punpcklwd  mm7,mm7
00426                   punpckldq  mm7,mm7           //fill register with 8 masks
00427 
00428                   movq       mm0,mask0
00429                   movq       mm1,mask1
00430 
00431                   pand       mm0,mm7
00432                   pand       mm1,mm7
00433 
00434                   pcmpeqb    mm0,mm6
00435                   pcmpeqb    mm1,mm6
00436 
00437                   mov        ecx,len           //load length of line
00438                   mov        esi,srcptr        //load source
00439                   mov        ebx,dstptr        //load dest
00440                   cmp        ecx,0             //lcr
00441                   jz         mainloop16end
00442 
00443 mainloop16:
00444                   movq       mm4,[esi]
00445                   pand       mm4,mm0
00446                   movq       mm6,mm0
00447                   movq       mm7,[ebx]
00448                   pandn      mm6,mm7
00449                   por        mm4,mm6
00450                   movq       [ebx],mm4
00451 
00452                   movq       mm5,[esi+8]
00453                   pand       mm5,mm1
00454                   movq       mm7,mm1
00455                   movq       mm6,[ebx+8]
00456                   pandn      mm7,mm6
00457                   por        mm5,mm7
00458                   movq       [ebx+8],mm5
00459 
00460                   add        esi,16            //inc by 16 bytes processed
00461                   add        ebx,16
00462                   sub        ecx,8             //dec by 8 pixels processed
00463 
00464                   ja         mainloop16
00465 
00466 mainloop16end:
00467                   mov        ecx,diff
00468                   cmp        ecx,0
00469                   jz         end16
00470 
00471                   mov        edx,mask
00472                   sal        edx,24            //make low byte the high byte
00473 secondloop16:
00474                   sal        edx,1             //move high bit to CF
00475                   jnc        skip16            //if CF = 0
00476                   mov        ax,[esi]
00477                   mov        [ebx],ax
00478 skip16:
00479                   add        esi,2
00480                   add        ebx,2
00481 
00482                   dec        ecx
00483                   jnz        secondloop16
00484 end16:
00485                   emms
00486                }
00487             }
00488             else /* mmx not supported - use modified C routine */
00489             {
00490                register unsigned int incr1, initial_val, final_val;
00491                png_size_t pixel_bytes;
00492                png_uint_32 i;
00493                register int disp = png_pass_inc[png_ptr->pass];
00494                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00495 
00496                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00497                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00498                   pixel_bytes;
00499                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00500                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00501                final_val = png_ptr->width*pixel_bytes;
00502                incr1 = (disp)*pixel_bytes;
00503                for (i = initial_val; i < final_val; i += incr1)
00504                {
00505                   png_memcpy(dstptr, srcptr, pixel_bytes);
00506                   srcptr += incr1;
00507                   dstptr += incr1;
00508                }
00509             } /* end of else */
00510 
00511             break;
00512          }       // end 16 bpp
00513 
00514          case 24:
00515          {
00516             png_bytep srcptr;
00517             png_bytep dstptr;
00518             png_uint_32 len;
00519             int unmask, diff;
00520 
00521             __int64 mask2=0x0101010202020404,  //24bpp
00522                     mask1=0x0408080810101020,
00523                     mask0=0x2020404040808080;
00524 
00525             srcptr = png_ptr->row_buf + 1;
00526             dstptr = row;
00527 
00528             unmask = ~mask;
00529             len     = (png_ptr->width)&~7;
00530             diff = (png_ptr->width)&7;
00531 
00532             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00533                 /* && mmx_supported */ )
00534             {
00535                _asm
00536                {
00537                   movd       mm7, unmask       //load bit pattern
00538                   psubb      mm6,mm6           //zero mm6
00539                   punpcklbw  mm7,mm7
00540                   punpcklwd  mm7,mm7
00541                   punpckldq  mm7,mm7           //fill register with 8 masks
00542 
00543                   movq       mm0,mask0
00544                   movq       mm1,mask1
00545                   movq       mm2,mask2
00546 
00547                   pand       mm0,mm7
00548                   pand       mm1,mm7
00549                   pand       mm2,mm7
00550 
00551                   pcmpeqb    mm0,mm6
00552                   pcmpeqb    mm1,mm6
00553                   pcmpeqb    mm2,mm6
00554 
00555                   mov        ecx,len           //load length of line
00556                   mov        esi,srcptr        //load source
00557                   mov        ebx,dstptr        //load dest
00558                   cmp        ecx,0
00559                   jz         mainloop24end
00560 
00561 mainloop24:
00562                   movq       mm4,[esi]
00563                   pand       mm4,mm0
00564                   movq       mm6,mm0
00565                   movq       mm7,[ebx]
00566                   pandn      mm6,mm7
00567                   por        mm4,mm6
00568                   movq       [ebx],mm4
00569 
00570 
00571                   movq       mm5,[esi+8]
00572                   pand       mm5,mm1
00573                   movq       mm7,mm1
00574                   movq       mm6,[ebx+8]
00575                   pandn      mm7,mm6
00576                   por        mm5,mm7
00577                   movq       [ebx+8],mm5
00578 
00579                   movq       mm6,[esi+16]
00580                   pand       mm6,mm2
00581                   movq       mm4,mm2
00582                   movq       mm7,[ebx+16]
00583                   pandn      mm4,mm7
00584                   por        mm6,mm4
00585                   movq       [ebx+16],mm6
00586 
00587                   add        esi,24            //inc by 24 bytes processed
00588                   add        ebx,24
00589                   sub        ecx,8             //dec by 8 pixels processed
00590 
00591                   ja         mainloop24
00592 
00593 mainloop24end:
00594                   mov        ecx,diff
00595                   cmp        ecx,0
00596                   jz         end24
00597 
00598                   mov        edx,mask
00599                   sal        edx,24            //make low byte the high byte
00600 secondloop24:
00601                   sal        edx,1             //move high bit to CF
00602                   jnc        skip24            //if CF = 0
00603                   mov        ax,[esi]
00604                   mov        [ebx],ax
00605                   xor        eax,eax
00606                   mov        al,[esi+2]
00607                   mov        [ebx+2],al
00608 skip24:
00609                   add        esi,3
00610                   add        ebx,3
00611 
00612                   dec        ecx
00613                   jnz        secondloop24
00614 
00615 end24:
00616                   emms
00617                }
00618             }
00619             else /* mmx not supported - use modified C routine */
00620             {
00621                register unsigned int incr1, initial_val, final_val;
00622                png_size_t pixel_bytes;
00623                png_uint_32 i;
00624                register int disp = png_pass_inc[png_ptr->pass];
00625                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00626 
00627                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00628                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00629                   pixel_bytes;
00630                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00631                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00632                final_val = png_ptr->width*pixel_bytes;
00633                incr1 = (disp)*pixel_bytes;
00634                for (i = initial_val; i < final_val; i += incr1)
00635                {
00636                   png_memcpy(dstptr, srcptr, pixel_bytes);
00637                   srcptr += incr1;
00638                   dstptr += incr1;
00639                }
00640             } /* end of else */
00641 
00642             break;
00643          }       // end 24 bpp
00644 
00645          case 32:
00646          {
00647             png_bytep srcptr;
00648             png_bytep dstptr;
00649             png_uint_32 len;
00650             int unmask, diff;
00651 
00652             __int64 mask3=0x0101010102020202,  //32bpp
00653                     mask2=0x0404040408080808,
00654                     mask1=0x1010101020202020,
00655                     mask0=0x4040404080808080;
00656 
00657             srcptr = png_ptr->row_buf + 1;
00658             dstptr = row;
00659 
00660             unmask = ~mask;
00661             len     = (png_ptr->width)&~7;
00662             diff = (png_ptr->width)&7;
00663 
00664             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00665                 /* && mmx_supported */ )
00666             {
00667                _asm
00668                {
00669                   movd       mm7, unmask       //load bit pattern
00670                   psubb      mm6,mm6           //zero mm6
00671                   punpcklbw  mm7,mm7
00672                   punpcklwd  mm7,mm7
00673                   punpckldq  mm7,mm7           //fill register with 8 masks
00674 
00675                   movq       mm0,mask0
00676                   movq       mm1,mask1
00677                   movq       mm2,mask2
00678                   movq       mm3,mask3
00679 
00680                   pand       mm0,mm7
00681                   pand       mm1,mm7
00682                   pand       mm2,mm7
00683                   pand       mm3,mm7
00684 
00685                   pcmpeqb    mm0,mm6
00686                   pcmpeqb    mm1,mm6
00687                   pcmpeqb    mm2,mm6
00688                   pcmpeqb    mm3,mm6
00689 
00690                   mov        ecx,len           //load length of line
00691                   mov        esi,srcptr        //load source
00692                   mov        ebx,dstptr        //load dest
00693 
00694                   cmp        ecx,0             //lcr
00695                   jz         mainloop32end
00696 
00697 mainloop32:
00698                   movq       mm4,[esi]
00699                   pand       mm4,mm0
00700                   movq       mm6,mm0
00701                   movq       mm7,[ebx]
00702                   pandn      mm6,mm7
00703                   por        mm4,mm6
00704                   movq       [ebx],mm4
00705 
00706                   movq       mm5,[esi+8]
00707                   pand       mm5,mm1
00708                   movq       mm7,mm1
00709                   movq       mm6,[ebx+8]
00710                   pandn      mm7,mm6
00711                   por        mm5,mm7
00712                   movq       [ebx+8],mm5
00713 
00714                   movq       mm6,[esi+16]
00715                   pand       mm6,mm2
00716                   movq       mm4,mm2
00717                   movq       mm7,[ebx+16]
00718                   pandn      mm4,mm7
00719                   por        mm6,mm4
00720                   movq       [ebx+16],mm6
00721 
00722                   movq       mm7,[esi+24]
00723                   pand       mm7,mm3
00724                   movq       mm5,mm3
00725                   movq       mm4,[ebx+24]
00726                   pandn      mm5,mm4
00727                   por        mm7,mm5
00728                   movq       [ebx+24],mm7
00729 
00730                   add        esi,32            //inc by 32 bytes processed
00731                   add        ebx,32
00732                   sub        ecx,8             //dec by 8 pixels processed
00733 
00734                   ja         mainloop32
00735 
00736 mainloop32end:
00737                   mov        ecx,diff
00738                   cmp        ecx,0
00739                   jz         end32
00740 
00741                   mov        edx,mask
00742                   sal        edx,24            //make low byte the high byte
00743 secondloop32:
00744                   sal        edx,1             //move high bit to CF
00745                   jnc        skip32            //if CF = 0
00746                   mov        eax,[esi]
00747                   mov        [ebx],eax
00748 skip32:
00749                   add        esi,4
00750                   add        ebx,4
00751 
00752                   dec        ecx
00753                   jnz        secondloop32
00754 
00755 end32:
00756                   emms
00757                }
00758             }
00759             else /* mmx _not supported - Use modified C routine */
00760             {
00761                register unsigned int incr1, initial_val, final_val;
00762                png_size_t pixel_bytes;
00763                png_uint_32 i;
00764                register int disp = png_pass_inc[png_ptr->pass];
00765                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00766 
00767                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00768                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00769                   pixel_bytes;
00770                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00771                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00772                final_val = png_ptr->width*pixel_bytes;
00773                incr1 = (disp)*pixel_bytes;
00774                for (i = initial_val; i < final_val; i += incr1)
00775                {
00776                   png_memcpy(dstptr, srcptr, pixel_bytes);
00777                   srcptr += incr1;
00778                   dstptr += incr1;
00779                }
00780             } /* end of else */
00781 
00782             break;
00783          }       // end 32 bpp
00784 
00785          case 48:
00786          {
00787             png_bytep srcptr;
00788             png_bytep dstptr;
00789             png_uint_32 len;
00790             int unmask, diff;
00791 
00792             __int64 mask5=0x0101010101010202,
00793                     mask4=0x0202020204040404,
00794                     mask3=0x0404080808080808,
00795                     mask2=0x1010101010102020,
00796                     mask1=0x2020202040404040,
00797                     mask0=0x4040808080808080;
00798 
00799             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
00800                 /* && mmx_supported */ )
00801             {
00802                srcptr = png_ptr->row_buf + 1;
00803                dstptr = row;
00804 
00805                unmask = ~mask;
00806                len     = (png_ptr->width)&~7;
00807                diff = (png_ptr->width)&7;
00808                _asm
00809                {
00810                   movd       mm7, unmask       //load bit pattern
00811                   psubb      mm6,mm6           //zero mm6
00812                   punpcklbw  mm7,mm7
00813                   punpcklwd  mm7,mm7
00814                   punpckldq  mm7,mm7           //fill register with 8 masks
00815 
00816                   movq       mm0,mask0
00817                   movq       mm1,mask1
00818                   movq       mm2,mask2
00819                   movq       mm3,mask3
00820                   movq       mm4,mask4
00821                   movq       mm5,mask5
00822 
00823                   pand       mm0,mm7
00824                   pand       mm1,mm7
00825                   pand       mm2,mm7
00826                   pand       mm3,mm7
00827                   pand       mm4,mm7
00828                   pand       mm5,mm7
00829 
00830                   pcmpeqb    mm0,mm6
00831                   pcmpeqb    mm1,mm6
00832                   pcmpeqb    mm2,mm6
00833                   pcmpeqb    mm3,mm6
00834                   pcmpeqb    mm4,mm6
00835                   pcmpeqb    mm5,mm6
00836 
00837                   mov        ecx,len           //load length of line
00838                   mov        esi,srcptr        //load source
00839                   mov        ebx,dstptr        //load dest
00840 
00841                   cmp        ecx,0
00842                   jz         mainloop48end
00843 
00844 mainloop48:
00845                   movq       mm7,[esi]
00846                   pand       mm7,mm0
00847                   movq       mm6,mm0
00848                   pandn      mm6,[ebx]
00849                   por        mm7,mm6
00850                   movq       [ebx],mm7
00851 
00852                   movq       mm6,[esi+8]
00853                   pand       mm6,mm1
00854                   movq       mm7,mm1
00855                   pandn      mm7,[ebx+8]
00856                   por        mm6,mm7
00857                   movq       [ebx+8],mm6
00858 
00859                   movq       mm6,[esi+16]
00860                   pand       mm6,mm2
00861                   movq       mm7,mm2
00862                   pandn      mm7,[ebx+16]
00863                   por        mm6,mm7
00864                   movq       [ebx+16],mm6
00865 
00866                   movq       mm7,[esi+24]
00867                   pand       mm7,mm3
00868                   movq       mm6,mm3
00869                   pandn      mm6,[ebx+24]
00870                   por        mm7,mm6
00871                   movq       [ebx+24],mm7
00872 
00873                   movq       mm6,[esi+32]
00874                   pand       mm6,mm4
00875                   movq       mm7,mm4
00876                   pandn      mm7,[ebx+32]
00877                   por        mm6,mm7
00878                   movq       [ebx+32],mm6
00879 
00880                   movq       mm7,[esi+40]
00881                   pand       mm7,mm5
00882                   movq       mm6,mm5
00883                   pandn      mm6,[ebx+40]
00884                   por        mm7,mm6
00885                   movq       [ebx+40],mm7
00886 
00887                   add        esi,48            //inc by 48 bytes processed
00888                   add        ebx,48
00889                   sub        ecx,8             //dec by 8 pixels processed
00890 
00891                   ja         mainloop48
00892 mainloop48end:
00893 
00894                   mov        ecx,diff
00895                   cmp        ecx,0
00896                   jz         end48
00897 
00898                   mov        edx,mask
00899                   sal        edx,24            //make low byte the high byte
00900 
00901 secondloop48:
00902                   sal        edx,1             //move high bit to CF
00903                   jnc        skip48            //if CF = 0
00904                   mov        eax,[esi]
00905                   mov        [ebx],eax
00906 skip48:
00907                   add        esi,4
00908                   add        ebx,4
00909 
00910                   dec        ecx
00911                   jnz        secondloop48
00912 
00913 end48:
00914                   emms
00915                }
00916             }
00917             else /* mmx _not supported - Use modified C routine */
00918             {
00919                register unsigned int incr1, initial_val, final_val;
00920                png_size_t pixel_bytes;
00921                png_uint_32 i;
00922                register int disp = png_pass_inc[png_ptr->pass];
00923                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00924 
00925                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00926                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00927                   pixel_bytes;
00928                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
00929                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00930                final_val = png_ptr->width*pixel_bytes;
00931                incr1 = (disp)*pixel_bytes;
00932                for (i = initial_val; i < final_val; i += incr1)
00933                {
00934                   png_memcpy(dstptr, srcptr, pixel_bytes);
00935                   srcptr += incr1;
00936                   dstptr += incr1;
00937                }
00938             } /* end of else */
00939 
00940             break;
00941          }       // end 48 bpp
00942 
00943          default:
00944          {
00945             png_bytep sptr;
00946             png_bytep dp;
00947             png_size_t pixel_bytes;
00948             int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
00949             unsigned int i;
00950             register int disp = png_pass_inc[png_ptr->pass];  // get the offset
00951             register unsigned int incr1, initial_val, final_val;
00952 
00953             pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
00954             sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
00955                pixel_bytes;
00956             dp = row + offset_table[png_ptr->pass]*pixel_bytes;
00957             initial_val = offset_table[png_ptr->pass]*pixel_bytes;
00958             final_val = png_ptr->width*pixel_bytes;
00959             incr1 = (disp)*pixel_bytes;
00960             for (i = initial_val; i < final_val; i += incr1)
00961             {
00962                png_memcpy(dp, sptr, pixel_bytes);
00963                sptr += incr1;
00964                dp += incr1;
00965             }
00966             break;
00967          }
00968       } /* end switch (png_ptr->row_info.pixel_depth) */
00969    } /* end if (non-trivial mask) */
00970 
00971 } /* end png_combine_row() */
00972 
00973 
00974 #if defined(PNG_READ_INTERLACING_SUPPORTED)
00975 
00976 void /* PRIVATE */
00977 png_do_read_interlace(png_structp png_ptr)
00978 {
00979    png_row_infop row_info = &(png_ptr->row_info);
00980    png_bytep row = png_ptr->row_buf + 1;
00981    int pass = png_ptr->pass;
00982    png_uint_32 transformations = png_ptr->transformations;
00983 #ifdef PNG_USE_LOCAL_ARRAYS
00984    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
00985 #endif
00986 
00987    png_debug(1,"in png_do_read_interlace\n");
00988 
00989    if (mmx_supported == 2) {
00990        /* this should have happened in png_init_mmx_flags() already */
00991        png_warning(png_ptr, "asm_flags may not have been initialized");
00992        png_mmx_support();
00993    }
00994 
00995    if (row != NULL && row_info != NULL)
00996    {
00997       png_uint_32 final_width;
00998 
00999       final_width = row_info->width * png_pass_inc[pass];
01000 
01001       switch (row_info->pixel_depth)
01002       {
01003          case 1:
01004          {
01005             png_bytep sp, dp;
01006             int sshift, dshift;
01007             int s_start, s_end, s_inc;
01008             png_byte v;
01009             png_uint_32 i;
01010             int j;
01011 
01012             sp = row + (png_size_t)((row_info->width - 1) >> 3);
01013             dp = row + (png_size_t)((final_width - 1) >> 3);
01014 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01015             if (transformations & PNG_PACKSWAP)
01016             {
01017                sshift = (int)((row_info->width + 7) & 7);
01018                dshift = (int)((final_width + 7) & 7);
01019                s_start = 7;
01020                s_end = 0;
01021                s_inc = -1;
01022             }
01023             else
01024 #endif
01025             {
01026                sshift = 7 - (int)((row_info->width + 7) & 7);
01027                dshift = 7 - (int)((final_width + 7) & 7);
01028                s_start = 0;
01029                s_end = 7;
01030                s_inc = 1;
01031             }
01032 
01033             for (i = row_info->width; i; i--)
01034             {
01035                v = (png_byte)((*sp >> sshift) & 0x1);
01036                for (j = 0; j < png_pass_inc[pass]; j++)
01037                {
01038                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
01039                   *dp |= (png_byte)(v << dshift);
01040                   if (dshift == s_end)
01041                   {
01042                      dshift = s_start;
01043                      dp--;
01044                   }
01045                   else
01046                      dshift += s_inc;
01047                }
01048                if (sshift == s_end)
01049                {
01050                   sshift = s_start;
01051                   sp--;
01052                }
01053                else
01054                   sshift += s_inc;
01055             }
01056             break;
01057          }
01058 
01059          case 2:
01060          {
01061             png_bytep sp, dp;
01062             int sshift, dshift;
01063             int s_start, s_end, s_inc;
01064             png_uint_32 i;
01065 
01066             sp = row + (png_size_t)((row_info->width - 1) >> 2);
01067             dp = row + (png_size_t)((final_width - 1) >> 2);
01068 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01069             if (transformations & PNG_PACKSWAP)
01070             {
01071                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
01072                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
01073                s_start = 6;
01074                s_end = 0;
01075                s_inc = -2;
01076             }
01077             else
01078 #endif
01079             {
01080                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
01081                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
01082                s_start = 0;
01083                s_end = 6;
01084                s_inc = 2;
01085             }
01086 
01087             for (i = row_info->width; i; i--)
01088             {
01089                png_byte v;
01090                int j;
01091 
01092                v = (png_byte)((*sp >> sshift) & 0x3);
01093                for (j = 0; j < png_pass_inc[pass]; j++)
01094                {
01095                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
01096                   *dp |= (png_byte)(v << dshift);
01097                   if (dshift == s_end)
01098                   {
01099                      dshift = s_start;
01100                      dp--;
01101                   }
01102                   else
01103                      dshift += s_inc;
01104                }
01105                if (sshift == s_end)
01106                {
01107                   sshift = s_start;
01108                   sp--;
01109                }
01110                else
01111                   sshift += s_inc;
01112             }
01113             break;
01114          }
01115 
01116          case 4:
01117          {
01118             png_bytep sp, dp;
01119             int sshift, dshift;
01120             int s_start, s_end, s_inc;
01121             png_uint_32 i;
01122 
01123             sp = row + (png_size_t)((row_info->width - 1) >> 1);
01124             dp = row + (png_size_t)((final_width - 1) >> 1);
01125 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
01126             if (transformations & PNG_PACKSWAP)
01127             {
01128                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
01129                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
01130                s_start = 4;
01131                s_end = 0;
01132                s_inc = -4;
01133             }
01134             else
01135 #endif
01136             {
01137                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
01138                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
01139                s_start = 0;
01140                s_end = 4;
01141                s_inc = 4;
01142             }
01143 
01144             for (i = row_info->width; i; i--)
01145             {
01146                png_byte v;
01147                int j;
01148 
01149                v = (png_byte)((*sp >> sshift) & 0xf);
01150                for (j = 0; j < png_pass_inc[pass]; j++)
01151                {
01152                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
01153                   *dp |= (png_byte)(v << dshift);
01154                   if (dshift == s_end)
01155                   {
01156                      dshift = s_start;
01157                      dp--;
01158                   }
01159                   else
01160                      dshift += s_inc;
01161                }
01162                if (sshift == s_end)
01163                {
01164                   sshift = s_start;
01165                   sp--;
01166                }
01167                else
01168                   sshift += s_inc;
01169             }
01170             break;
01171          }
01172 
01173          default:         // This is the place where the routine is modified
01174          {
01175             __int64 const4 = 0x0000000000FFFFFF;
01176             // __int64 const5 = 0x000000FFFFFF0000;  // unused...
01177             __int64 const6 = 0x00000000000000FF;
01178             png_bytep sptr, dp;
01179             png_uint_32 i;
01180             png_size_t pixel_bytes;
01181             int width = row_info->width;
01182 
01183             pixel_bytes = (row_info->pixel_depth >> 3);
01184 
01185             sptr = row + (width - 1) * pixel_bytes;
01186             dp = row + (final_width - 1) * pixel_bytes;
01187             // New code by Nirav Chhatrapati - Intel Corporation
01188             // sign fix by GRR
01189             // NOTE:  there is NO MMX code for 48-bit and 64-bit images
01190 
01191             // use MMX routine if machine supports it
01192             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
01193                 /* && mmx_supported */ )
01194             {
01195                if (pixel_bytes == 3)
01196                {
01197                   if (((pass == 0) || (pass == 1)) && width)
01198                   {
01199                      _asm
01200                      {
01201                         mov esi, sptr
01202                         mov edi, dp
01203                         mov ecx, width
01204                         sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
01205 loop_pass0:
01206                         movd mm0, [esi]     ; X X X X X v2 v1 v0
01207                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
01208                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
01209                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
01210                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
01211                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
01212                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
01213                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
01214                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
01215                         movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
01216                         psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
01217                         movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
01218                         punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
01219                         movq [edi+16] , mm4
01220                         psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
01221                         movq [edi+8] , mm3
01222                         punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
01223                         sub esi, 3
01224                         movq [edi], mm0
01225                         sub edi, 24
01226                         //sub esi, 3
01227                         dec ecx
01228                         jnz loop_pass0
01229                         EMMS
01230                      }
01231                   }
01232                   else if (((pass == 2) || (pass == 3)) && width)
01233                   {
01234                      _asm
01235                      {
01236                         mov esi, sptr
01237                         mov edi, dp
01238                         mov ecx, width
01239                         sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
01240 loop_pass2:
01241                         movd mm0, [esi]     ; X X X X X v2 v1 v0
01242                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
01243                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
01244                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
01245                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
01246                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
01247                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
01248                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
01249                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
01250                         movq [edi+4], mm0   ; move to memory
01251                         psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
01252                         movd [edi], mm0     ; move to memory
01253                         sub esi, 3
01254                         sub edi, 12
01255                         dec ecx
01256                         jnz loop_pass2
01257                         EMMS
01258                      }
01259                   }
01260                   else if (width) /* && ((pass == 4) || (pass == 5)) */
01261                   {
01262                      int width_mmx = ((width >> 1) << 1) - 8;
01263                      if (width_mmx < 0)
01264                          width_mmx = 0;
01265                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
01266                      if (width_mmx)
01267                      {
01268                         _asm
01269                         {
01270                            mov esi, sptr
01271                            mov edi, dp
01272                            mov ecx, width_mmx
01273                            sub esi, 3
01274                            sub edi, 9
01275 loop_pass4:
01276                            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
01277                            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
01278                            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
01279                            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
01280                            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
01281                            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
01282                            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
01283                            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
01284                            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
01285                            movq [edi], mm0     ; move quad to memory
01286                            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
01287                            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
01288                            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
01289                            movd [edi+8], mm6   ; move double to memory
01290                            sub esi, 6
01291                            sub edi, 12
01292                            sub ecx, 2
01293                            jnz loop_pass4
01294                            EMMS
01295                         }
01296                      }
01297 
01298                      sptr -= width_mmx*3;
01299                      dp -= width_mmx*6;
01300                      for (i = width; i; i--)
01301                      {
01302                         png_byte v[8];
01303                         int j;
01304 
01305                         png_memcpy(v, sptr, 3);
01306                         for (j = 0; j < png_pass_inc[pass]; j++)
01307                         {
01308                            png_memcpy(dp, v, 3);
01309                            dp -= 3;
01310                         }
01311                         sptr -= 3;
01312                      }
01313                   }
01314                } /* end of pixel_bytes == 3 */
01315 
01316                else if (pixel_bytes == 1)
01317                {
01318                   if (((pass == 0) || (pass == 1)) && width)
01319                   {
01320                      int width_mmx = ((width >> 2) << 2);
01321                      width -= width_mmx;
01322                      if (width_mmx)
01323                      {
01324                         _asm
01325                         {
01326                            mov esi, sptr
01327                            mov edi, dp
01328                            mov ecx, width_mmx
01329                            sub edi, 31
01330                            sub esi, 3
01331 loop1_pass0:
01332                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
01333                            movq mm1, mm0       ; X X X X v0 v1 v2 v3
01334                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
01335                            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
01336                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
01337                            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
01338                            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
01339                            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
01340                            movq [edi], mm0     ; move to memory v3
01341                            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
01342                            movq [edi+8], mm3   ; move to memory v2
01343                            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
01344                            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
01345                            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
01346                            movq [edi+16], mm2  ; move to memory v1
01347                            movq [edi+24], mm4  ; move to memory v0
01348                            sub esi, 4
01349                            sub edi, 32
01350                            sub ecx, 4
01351                            jnz loop1_pass0
01352                            EMMS
01353                         }
01354                      }
01355 
01356                      sptr -= width_mmx;
01357                      dp -= width_mmx*8;
01358                      for (i = width; i; i--)
01359                      {
01360                         int j;
01361 
01362                        /* I simplified this part in version 1.0.4e
01363                         * here and in several other instances where
01364                         * pixel_bytes == 1  -- GR-P
01365                         *
01366                         * Original code:
01367                         *
01368                         * png_byte v[8];
01369                         * png_memcpy(v, sptr, pixel_bytes);
01370                         * for (j = 0; j < png_pass_inc[pass]; j++)
01371                         * {
01372                         *    png_memcpy(dp, v, pixel_bytes);
01373                         *    dp -= pixel_bytes;
01374                         * }
01375                         * sptr -= pixel_bytes;
01376                         *
01377                         * Replacement code is in the next three lines:
01378                         */
01379 
01380                         for (j = 0; j < png_pass_inc[pass]; j++)
01381                            *dp-- = *sptr;
01382                         sptr--;
01383                      }
01384                   }
01385                   else if (((pass == 2) || (pass == 3)) && width)
01386                   {
01387                      int width_mmx = ((width >> 2) << 2);
01388                      width -= width_mmx;
01389                      if (width_mmx)
01390                      {
01391                         _asm
01392                         {
01393                            mov esi, sptr
01394                            mov edi, dp
01395                            mov ecx, width_mmx
01396                            sub edi, 15
01397                            sub esi, 3
01398 loop1_pass2:
01399                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
01400                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
01401                            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
01402                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
01403                            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
01404                            movq [edi], mm0     ; move to memory v2 and v3
01405                            sub esi, 4
01406                            movq [edi+8], mm1   ; move to memory v1     and v0
01407                            sub edi, 16
01408                            sub ecx, 4
01409                            jnz loop1_pass2
01410                            EMMS
01411                         }
01412                      }
01413 
01414                      sptr -= width_mmx;
01415                      dp -= width_mmx*4;
01416                      for (i = width; i; i--)
01417                      {
01418                         int j;
01419 
01420                         for (j = 0; j < png_pass_inc[pass]; j++)
01421                         {
01422                            *dp-- = *sptr;
01423                         }
01424                         sptr --;
01425                      }
01426                   }
01427                   else if (width) /* && ((pass == 4) || (pass == 5))) */
01428                   {
01429                      int width_mmx = ((width >> 3) << 3);
01430                      width -= width_mmx;
01431                      if (width_mmx)
01432                      {
01433                         _asm
01434                         {
01435                            mov esi, sptr
01436                            mov edi, dp
01437                            mov ecx, width_mmx
01438                            sub edi, 15
01439                            sub esi, 7
01440 loop1_pass4:
01441                            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
01442                            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
01443                            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
01444                            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
01445                            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
01446                            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
01447                            sub esi, 8
01448                            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
01449                            //sub esi, 4
01450                            sub edi, 16
01451                            sub ecx, 8
01452                            jnz loop1_pass4
01453                            EMMS
01454                         }
01455                      }
01456 
01457                      sptr -= width_mmx;
01458                      dp -= width_mmx*2;
01459                      for (i = width; i; i--)
01460                      {
01461                         int j;
01462 
01463                         for (j = 0; j < png_pass_inc[pass]; j++)
01464                         {
01465                            *dp-- = *sptr;
01466                         }
01467                         sptr --;
01468                      }
01469                   }
01470                } /* end of pixel_bytes == 1 */
01471 
01472                else if (pixel_bytes == 2)
01473                {
01474                   if (((pass == 0) || (pass == 1)) && width)
01475                   {
01476                      int width_mmx = ((width >> 1) << 1);
01477                      width -= width_mmx;
01478                      if (width_mmx)
01479                      {
01480                         _asm
01481                         {
01482                            mov esi, sptr
01483                            mov edi, dp
01484                            mov ecx, width_mmx
01485                            sub esi, 2
01486                            sub edi, 30
01487 loop2_pass0:
01488                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
01489                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
01490                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
01491                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
01492                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
01493                            movq [edi], mm0
01494                            movq [edi + 8], mm0
01495                            movq [edi + 16], mm1
01496                            movq [edi + 24], mm1
01497                            sub esi, 4
01498                            sub edi, 32
01499                            sub ecx, 2
01500                            jnz loop2_pass0
01501                            EMMS
01502                         }
01503                      }
01504 
01505                      sptr -= (width_mmx*2 - 2);            // sign fixed
01506                      dp -= (width_mmx*16 - 2);            // sign fixed
01507                      for (i = width; i; i--)
01508                      {
01509                         png_byte v[8];
01510                         int j;
01511                         sptr -= 2;
01512                         png_memcpy(v, sptr, 2);
01513                         for (j = 0; j < png_pass_inc[pass]; j++)
01514                         {
01515                            dp -= 2;
01516                            png_memcpy(dp, v, 2);
01517                         }
01518                      }
01519                   }
01520                   else if (((pass == 2) || (pass == 3)) && width)
01521                   {
01522                      int width_mmx = ((width >> 1) << 1) ;
01523                      width -= width_mmx;
01524                      if (width_mmx)
01525                      {
01526                         _asm
01527                         {
01528                            mov esi, sptr
01529                            mov edi, dp
01530                            mov ecx, width_mmx
01531                            sub esi, 2
01532                            sub edi, 14
01533 loop2_pass2:
01534                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
01535                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
01536                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
01537                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
01538                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
01539                            movq [edi], mm0
01540                            sub esi, 4
01541                            movq [edi + 8], mm1
01542                            //sub esi, 4
01543                            sub edi, 16
01544                            sub ecx, 2
01545                            jnz loop2_pass2
01546                            EMMS
01547                         }
01548                      }
01549 
01550                      sptr -= (width_mmx*2 - 2);            // sign fixed
01551                      dp -= (width_mmx*8 - 2);            // sign fixed
01552                      for (i = width; i; i--)
01553                      {
01554                         png_byte v[8];
01555                         int j;
01556                         sptr -= 2;
01557                         png_memcpy(v, sptr, 2);
01558                         for (j = 0; j < png_pass_inc[pass]; j++)
01559                         {
01560                            dp -= 2;
01561                            png_memcpy(dp, v, 2);
01562                         }
01563                      }
01564                   }
01565                   else if (width)  // pass == 4 or 5
01566                   {
01567                      int width_mmx = ((width >> 1) << 1) ;
01568                      width -= width_mmx;
01569                      if (width_mmx)
01570                      {
01571                         _asm
01572                         {
01573                            mov esi, sptr
01574                            mov edi, dp
01575                            mov ecx, width_mmx
01576                            sub esi, 2
01577                            sub edi, 6
01578 loop2_pass4:
01579                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
01580                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
01581                            sub esi, 4
01582                            movq [edi], mm0
01583                            sub edi, 8
01584                            sub ecx, 2
01585                            jnz loop2_pass4
01586                            EMMS
01587                         }
01588                      }
01589 
01590                      sptr -= (width_mmx*2 - 2);            // sign fixed
01591                      dp -= (width_mmx*4 - 2);            // sign fixed
01592                      for (i = width; i; i--)
01593                      {
01594                         png_byte v[8];
01595                         int j;
01596                         sptr -= 2;
01597                         png_memcpy(v, sptr, 2);
01598                         for (j = 0; j < png_pass_inc[pass]; j++)
01599                         {
01600                            dp -= 2;
01601                            png_memcpy(dp, v, 2);
01602                         }
01603                      }
01604                   }
01605                } /* end of pixel_bytes == 2 */
01606 
01607                else if (pixel_bytes == 4)
01608                {
01609                   if (((pass == 0) || (pass == 1)) && width)
01610                   {
01611                      int width_mmx = ((width >> 1) << 1) ;
01612                      width -= width_mmx;
01613                      if (width_mmx)
01614                      {
01615                         _asm
01616                         {
01617                            mov esi, sptr
01618                            mov edi, dp
01619                            mov ecx, width_mmx
01620                            sub esi, 4
01621                            sub edi, 60
01622 loop4_pass0:
01623                            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
01624                            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
01625                            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
01626                            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
01627                            movq [edi], mm0
01628                            movq [edi + 8], mm0
01629                            movq [edi + 16], mm0
01630                            movq [edi + 24], mm0
01631                            movq [edi+32], mm1
01632                            movq [edi + 40], mm1
01633                            movq [edi+ 48], mm1
01634                            sub esi, 8
01635                            movq [edi + 56], mm1
01636                            sub edi, 64
01637                            sub ecx, 2
01638                            jnz loop4_pass0
01639                            EMMS
01640                         }
01641                      }
01642 
01643                      sptr -= (width_mmx*4 - 4);            // sign fixed
01644                      dp -= (width_mmx*32 - 4);            // sign fixed
01645                      for (i = width; i; i--)
01646                      {
01647                         png_byte v[8];
01648                         int j;
01649                         sptr -= 4;
01650                         png_memcpy(v, sptr, 4);
01651                         for (j = 0; j < png_pass_inc[pass]; j++)
01652                         {
01653                            dp -= 4;
01654                            png_memcpy(dp, v, 4);
01655                         }
01656                      }
01657                   }
01658                   else if (((pass == 2) || (pass == 3)) && width)
01659                   {
01660                      int width_mmx = ((width >> 1) << 1) ;
01661                      width -= width_mmx;
01662                      if (width_mmx)
01663                      {
01664                         _asm
01665                         {
01666                            mov esi, sptr
01667                            mov edi, dp
01668                            mov ecx, width_mmx
01669                            sub esi, 4
01670                            sub edi, 28
01671 loop4_pass2:
01672                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
01673                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
01674                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
01675                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
01676                            movq [edi], mm0
01677                            movq [edi + 8], mm0
01678                            movq [edi+16], mm1
01679                            movq [edi + 24], mm1
01680                            sub esi, 8
01681                            sub edi, 32
01682                            sub ecx, 2
01683                            jnz loop4_pass2
01684                            EMMS
01685                         }
01686                      }
01687 
01688                      sptr -= (width_mmx*4 - 4);            // sign fixed
01689                      dp -= (width_mmx*16 - 4);            // sign fixed
01690                      for (i = width; i; i--)
01691                      {
01692                         png_byte v[8];
01693                         int j;
01694                         sptr -= 4;
01695                         png_memcpy(v, sptr, 4);
01696                         for (j = 0; j < png_pass_inc[pass]; j++)
01697                         {
01698                            dp -= 4;
01699                            png_memcpy(dp, v, 4);
01700                         }
01701                      }
01702                   }
01703                   else if (width)  // pass == 4 or 5
01704                   {
01705                      int width_mmx = ((width >> 1) << 1) ;
01706                      width -= width_mmx;
01707                      if (width_mmx)
01708                      {
01709                         _asm
01710                         {
01711                            mov esi, sptr
01712                            mov edi, dp
01713                            mov ecx, width_mmx
01714                            sub esi, 4
01715                            sub edi, 12
01716 loop4_pass4:
01717                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
01718                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
01719                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
01720                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
01721                            movq [edi], mm0
01722                            sub esi, 8
01723                            movq [edi + 8], mm1
01724                            sub edi, 16
01725                            sub ecx, 2
01726                            jnz loop4_pass4
01727                            EMMS
01728                         }
01729                      }
01730 
01731                      sptr -= (width_mmx*4 - 4);          // sign fixed
01732                      dp -= (width_mmx*8 - 4);            // sign fixed
01733                      for (i = width; i; i--)
01734                      {
01735                         png_byte v[8];
01736                         int j;
01737                         sptr -= 4;
01738                         png_memcpy(v, sptr, 4);
01739                         for (j = 0; j < png_pass_inc[pass]; j++)
01740                         {
01741                            dp -= 4;
01742                            png_memcpy(dp, v, 4);
01743                         }
01744                      }
01745                   }
01746 
01747                } /* end of pixel_bytes == 4 */
01748 
01749                else if (pixel_bytes == 6)
01750                {
01751                   for (i = width; i; i--)
01752                   {
01753                      png_byte v[8];
01754                      int j;
01755                      png_memcpy(v, sptr, 6);
01756                      for (j = 0; j < png_pass_inc[pass]; j++)
01757                      {
01758                         png_memcpy(dp, v, 6);
01759                         dp -= 6;
01760                      }
01761                      sptr -= 6;
01762                   }
01763                } /* end of pixel_bytes == 6 */
01764 
01765                else
01766                {
01767                   for (i = width; i; i--)
01768                   {
01769                      png_byte v[8];
01770                      int j;
01771                      png_memcpy(v, sptr, pixel_bytes);
01772                      for (j = 0; j < png_pass_inc[pass]; j++)
01773                      {
01774                         png_memcpy(dp, v, pixel_bytes);
01775                         dp -= pixel_bytes;
01776                      }
01777                      sptr-= pixel_bytes;
01778                   }
01779                }
01780             } /* end of mmx_supported */
01781 
01782             else /* MMX not supported:  use modified C code - takes advantage
01783                   * of inlining of memcpy for a constant */
01784             {
01785                if (pixel_bytes == 1)
01786                {
01787                   for (i = width; i; i--)
01788                   {
01789                      int j;
01790                      for (j = 0; j < png_pass_inc[pass]; j++)
01791                         *dp-- = *sptr;
01792                      sptr--;
01793                   }
01794                }
01795                else if (pixel_bytes == 3)
01796                {
01797                   for (i = width; i; i--)
01798                   {
01799                      png_byte v[8];
01800                      int j;
01801                      png_memcpy(v, sptr, pixel_bytes);
01802                      for (j = 0; j < png_pass_inc[pass]; j++)
01803                      {
01804                         png_memcpy(dp, v, pixel_bytes);
01805                         dp -= pixel_bytes;
01806                      }
01807                      sptr -= pixel_bytes;
01808                   }
01809                }
01810                else if (pixel_bytes == 2)
01811                {
01812                   for (i = width; i; i--)
01813                   {
01814                      png_byte v[8];
01815                      int j;
01816                      png_memcpy(v, sptr, pixel_bytes);
01817                      for (j = 0; j < png_pass_inc[pass]; j++)
01818                      {
01819                         png_memcpy(dp, v, pixel_bytes);
01820                         dp -= pixel_bytes;
01821                      }
01822                      sptr -= pixel_bytes;
01823                   }
01824                }
01825                else if (pixel_bytes == 4)
01826                {
01827                   for (i = width; i; i--)
01828                   {
01829                      png_byte v[8];
01830                      int j;
01831                      png_memcpy(v, sptr, pixel_bytes);
01832                      for (j = 0; j < png_pass_inc[pass]; j++)
01833                      {
01834                         png_memcpy(dp, v, pixel_bytes);
01835                         dp -= pixel_bytes;
01836                      }
01837                      sptr -= pixel_bytes;
01838                   }
01839                }
01840                else if (pixel_bytes == 6)
01841                {
01842                   for (i = width; i; i--)
01843                   {
01844                      png_byte v[8];
01845                      int j;
01846                      png_memcpy(v, sptr, pixel_bytes);
01847                      for (j = 0; j < png_pass_inc[pass]; j++)
01848                      {
01849                         png_memcpy(dp, v, pixel_bytes);
01850                         dp -= pixel_bytes;
01851                      }
01852                      sptr -= pixel_bytes;
01853                   }
01854                }
01855                else
01856                {
01857                   for (i = width; i; i--)
01858                   {
01859                      png_byte v[8];
01860                      int j;
01861                      png_memcpy(v, sptr, pixel_bytes);
01862                      for (j = 0; j < png_pass_inc[pass]; j++)
01863                      {
01864                         png_memcpy(dp, v, pixel_bytes);
01865                         dp -= pixel_bytes;
01866                      }
01867                      sptr -= pixel_bytes;
01868                   }
01869                }
01870 
01871             } /* end of MMX not supported */
01872             break;
01873          }
01874       } /* end switch (row_info->pixel_depth) */
01875 
01876       row_info->width = final_width;
01877       row_info->rowbytes = ((final_width *
01878          (png_uint_32)row_info->pixel_depth + 7) >> 3);
01879    }
01880 
01881 }
01882 
01883 #endif /* PNG_READ_INTERLACING_SUPPORTED */
01884 
01885 
01886 // These variables are utilized in the functions below.  They are declared
01887 // globally here to ensure alignment on 8-byte boundaries.
01888 
01889 union uAll {      // 64-bit operand type shared between the C code and the MMX inline assembly below
01890    __int64 use;   // integer view: assigned from C (e.g. ActiveMask.use = ...) and read whole by movq/psrlq/psllq
01891    double  align; // not accessed in the code reviewed here; presumably present only to force 8-byte alignment -- TODO confirm
01892 } LBCarryMask = {0x0101010101010101},   // 0x01 in every byte: pand with it keeps the least-significant bit of each byte
01893   HBClearMask = {0x7f7f7f7f7f7f7f7f},   // 0x7f in every byte: pand with it clears bit 7 of each byte after a 64-bit shift right by 1
01894   ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem; // no initializers; ActiveMask, ShiftBpp and ShiftRem are assigned per bpp case in png_read_filter_row_mmx_avg
01895 
01896 
01897 // Optimized code for PNG Average filter decoder
01898 void /* PRIVATE */
01899 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
01900                             , png_bytep prev_row)
01901 {
01902    int bpp;
01903    png_uint_32 FullLength;
01904    png_uint_32 MMXLength;
01905    //png_uint_32 len;
01906    int diff;
01907 
01908    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
01909    FullLength  = row_info->rowbytes; // # of bytes to filter
01910    _asm {
01911          // Init address pointers and offset
01912          mov edi, row          // edi ==> Avg(x)
01913          xor ebx, ebx          // ebx ==> x
01914          mov edx, edi
01915          mov esi, prev_row           // esi ==> Prior(x)
01916          sub edx, bpp          // edx ==> Raw(x-bpp)
01917 
01918          xor eax, eax
01919          // Compute the Raw value for the first bpp bytes
01920          //    Raw(x) = Avg(x) + (Prior(x)/2)
01921 davgrlp:
01922          mov al, [esi + ebx]   // Load al with Prior(x)
01923          inc ebx
01924          shr al, 1             // divide by 2
01925          add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
01926          cmp ebx, bpp
01927          mov [edi+ebx-1], al    // Write back Raw(x);
01928                             // mov does not affect flags; -1 to offset inc ebx
01929          jb davgrlp
01930          // get # of bytes to alignment
01931          mov diff, edi         // take start of row
01932          add diff, ebx         // add bpp
01933          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
01934          and diff, 0xfffffff8  // mask to alignment boundary
01935          sub diff, edi         // subtract from start ==> value ebx at alignment
01936          jz davggo
01937          // fix alignment
01938          // Compute the Raw value for the bytes upto the alignment boundary
01939          //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
01940          xor ecx, ecx
01941 davglp1:
01942          xor eax, eax
01943          mov cl, [esi + ebx]        // load cl with Prior(x)
01944          mov al, [edx + ebx]  // load al with Raw(x-bpp)
01945          add ax, cx
01946          inc ebx
01947          shr ax, 1            // divide by 2
01948          add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
01949          cmp ebx, diff              // Check if at alignment boundary
01950          mov [edi+ebx-1], al        // Write back Raw(x);
01951                             // mov does not affect flags; -1 to offset inc ebx
01952          jb davglp1               // Repeat until at alignment boundary
01953 davggo:
01954          mov eax, FullLength
01955          mov ecx, eax
01956          sub eax, ebx          // subtract alignment fix
01957          and eax, 0x00000007   // calc bytes over mult of 8
01958          sub ecx, eax          // drop over bytes from original length
01959          mov MMXLength, ecx
01960    } // end _asm block
01961    // Now do the math for the rest of the row
01962    switch ( bpp )
01963    {
01964       case 3:
01965       {
01966          ActiveMask.use  = 0x0000000000ffffff;
01967          ShiftBpp.use = 24;    // == 3 * 8
01968          ShiftRem.use = 40;    // == 64 - 24
01969          _asm {
01970             // Re-init address pointers and offset
01971             movq mm7, ActiveMask
01972             mov ebx, diff      // ebx ==> x = offset to alignment boundary
01973             movq mm5, LBCarryMask
01974             mov edi, row       // edi ==> Avg(x)
01975             movq mm4, HBClearMask
01976             mov esi, prev_row        // esi ==> Prior(x)
01977             // PRIME the pump (load the first Raw(x-bpp) data set
01978             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
01979                                // (we correct position in loop below)
01980 davg3lp:
01981             movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
01982             // Add (Prev_row/2) to Average
01983             movq mm3, mm5
01984             psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
01985             movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
01986             movq mm6, mm7
01987             pand mm3, mm1      // get lsb for each prev_row byte
01988             psrlq mm1, 1       // divide prev_row bytes by 2
01989             pand  mm1, mm4     // clear invalid bit 7 of each byte
01990             paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
01991             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
01992             movq mm1, mm3      // now use mm1 for getting LBCarrys
01993             pand mm1, mm2      // get LBCarrys for each byte where both
01994                                // lsb's were == 1 (Only valid for active group)
01995             psrlq mm2, 1       // divide raw bytes by 2
01996             pand  mm2, mm4     // clear invalid bit 7 of each byte
01997             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
01998             pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
01999             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
02000                                //  byte
02001             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
02002             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
02003             movq mm2, mm0        // mov updated Raws to mm2
02004             psllq mm2, ShiftBpp  // shift data to position correctly
02005             movq mm1, mm3        // now use mm1 for getting LBCarrys
02006             pand mm1, mm2      // get LBCarrys for each byte where both
02007                                // lsb's were == 1 (Only valid for active group)
02008             psrlq mm2, 1       // divide raw bytes by 2
02009             pand  mm2, mm4     // clear invalid bit 7 of each byte
02010             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
02011             pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
02012             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
02013                                //  byte
02014 
02015             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
02016             psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
02017                                  // bytes
02018             movq mm2, mm0        // mov updated Raws to mm2
02019             psllq mm2, ShiftBpp  // shift data to position correctly
02020                               // Data only needs to be shifted once here to
02021                               // get the correct x-bpp offset.
02022             movq mm1, mm3     // now use mm1 for getting LBCarrys
02023             pand mm1, mm2     // get LBCarrys for each byte where both
02024                               // lsb's were == 1 (Only valid for active group)
02025             psrlq mm2, 1      // divide raw bytes by 2
02026             pand  mm2, mm4    // clear invalid bit 7 of each byte
02027             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
02028             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
02029             add ebx, 8
02030             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
02031                               // byte
02032 
02033             // Now ready to write back to memory
02034             movq [edi + ebx - 8], mm0
02035             // Move updated Raw(x) to use as Raw(x-bpp) for next loop
02036             cmp ebx, MMXLength
02037             movq mm2, mm0     // mov updated Raw(x) to mm2
02038             jb davg3lp
02039          } // end _asm block
02040       }
02041       break;
02042 
02043       case 6:
02044       case 4:
02045       case 7:
02046       case 5:
02047       {
02048          ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
02049                                                 // appropriate inactive bytes
02050          ShiftBpp.use = bpp << 3;
02051          ShiftRem.use = 64 - ShiftBpp.use;
02052          _asm {
02053             movq mm4, HBClearMask
02054             // Re-init address pointers and offset
02055             mov ebx, diff       // ebx ==> x = offset to alignment boundary
02056             // Load ActiveMask and clear all bytes except for 1st active group
02057             movq mm7, ActiveMask
02058             mov edi, row         // edi ==> Avg(x)
02059             psrlq mm7, ShiftRem
02060             mov esi, prev_row    // esi ==> Prior(x)
02061             movq mm6, mm7
02062             movq mm5, LBCarryMask
02063             psllq mm6, ShiftBpp  // Create mask for 2nd active group
02064             // PRIME the pump (load the first Raw(x-bpp) data set
02065             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
02066                                  // (we correct position in loop below)
02067 davg4lp:
02068             movq mm0, [edi + ebx]
02069             psrlq mm2, ShiftRem  // shift data to position correctly
02070             movq mm1, [esi + ebx]
02071             // Add (Prev_row/2) to Average
02072             movq mm3, mm5
02073             pand mm3, mm1     // get lsb for each prev_row byte
02074             psrlq mm1, 1      // divide prev_row bytes by 2
02075             pand  mm1, mm4    // clear invalid bit 7 of each byte
02076             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
02077             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
02078             movq mm1, mm3     // now use mm1 for getting LBCarrys
02079             pand mm1, mm2     // get LBCarrys for each byte where both
02080                               // lsb's were == 1 (Only valid for active group)
02081             psrlq mm2, 1      // divide raw bytes by 2
02082             pand  mm2, mm4    // clear invalid bit 7 of each byte
02083             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
02084             pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
02085             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
02086                               // byte
02087             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
02088             movq mm2, mm0     // mov updated Raws to mm2
02089             psllq mm2, ShiftBpp // shift data to position correctly
02090             add ebx, 8
02091             movq mm1, mm3     // now use mm1 for getting LBCarrys
02092             pand mm1, mm2     // get LBCarrys for each byte where both
02093                               // lsb's were == 1 (Only valid for active group)
02094             psrlq mm2, 1      // divide raw bytes by 2
02095             pand  mm2, mm4    // clear invalid bit 7 of each byte
02096             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
02097             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
02098             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
02099                               // byte
02100             cmp ebx, MMXLength
02101             // Now ready to write back to memory
02102             movq [edi + ebx - 8], mm0
02103             // Prep Raw(x-bpp) for next loop
02104             movq mm2, mm0     // mov updated Raws to mm2
02105             jb davg4lp
02106          } // end _asm block
02107       }
02108       break;
02109       case 2:
02110       {
02111          ActiveMask.use  = 0x000000000000ffff;
02112          ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
02113          ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
02114          _asm {
02115             // Load ActiveMask
02116             movq mm7, ActiveMask
02117             // Re-init address pointers and offset
02118             mov ebx, diff     // ebx ==> x = offset to alignment boundary
02119             movq mm5, LBCarryMask
02120             mov edi, row      // edi ==> Avg(x)
02121             movq mm4, HBClearMask
02122             mov esi, prev_row  // esi ==> Prior(x)
02123             // PRIME the pump (load the first Raw(x-bpp) data set
02124             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
02125                               // (we correct position in loop below)
02126 davg2lp:
02127             movq mm0, [edi + ebx]
02128             psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
02129             movq mm1, [esi + ebx]
02130             // Add (Prev_row/2) to Average
02131             movq mm3, mm5
02132             pand mm3, mm1     // get lsb for each prev_row byte
02133             psrlq mm1, 1      // divide prev_row bytes by 2
02134             pand  mm1, mm4    // clear invalid bit 7 of each byte
02135             movq mm6, mm7
02136             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
02137             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
02138             movq mm1, mm3     // now use mm1 for getting LBCarrys
02139             pand mm1, mm2     // get LBCarrys for each byte where both
02140                               // lsb's were == 1 (Only valid for active group)
02141             psrlq mm2, 1      // divide raw bytes by 2
02142             pand  mm2, mm4    // clear invalid bit 7 of each byte
02143             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
02144             pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
02145             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
02146             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
02147             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
02148             movq mm2, mm0       // mov updated Raws to mm2
02149             psllq mm2, ShiftBpp // shift data to position correctly
02150             movq mm1, mm3       // now use mm1 for getting LBCarrys
02151             pand mm1, mm2       // get LBCarrys for each byte where both
02152                                 // lsb's were == 1 (Only valid for active group)
02153             psrlq mm2, 1        // divide raw bytes by 2
02154             pand  mm2, mm4      // clear invalid bit 7 of each byte
02155             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
02156             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
02157             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
02158 
02159             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
02160             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
02161             movq mm2, mm0       // mov updated Raws to mm2
02162             psllq mm2, ShiftBpp // shift data to position correctly
02163                                 // Data only needs to be shifted once here to
02164                                 // get the correct x-bpp offset.
02165             movq mm1, mm3       // now use mm1 for getting LBCarrys
02166             pand mm1, mm2       // get LBCarrys for each byte where both
02167                                 // lsb's were == 1 (Only valid for active group)
02168             psrlq mm2, 1        // divide raw bytes by 2
02169             pand  mm2, mm4      // clear invalid bit 7 of each byte
02170             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
02171             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
02172             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
02173 
02174             // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
02175             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
02176             movq mm2, mm0        // mov updated Raws to mm2
02177             psllq mm2, ShiftBpp  // shift data to position correctly
02178                                  // Data only needs to be shifted once here to
02179                                  // get the correct x-bpp offset.
02180             add ebx, 8
02181             movq mm1, mm3    // now use mm1 for getting LBCarrys
02182             pand mm1, mm2    // get LBCarrys for each byte where both
02183                              // lsb's were == 1 (Only valid for active group)
02184             psrlq mm2, 1     // divide raw bytes by 2
02185             pand  mm2, mm4   // clear invalid bit 7 of each byte
02186             paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
02187             pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
02188             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
02189 
02190             cmp ebx, MMXLength
02191             // Now ready to write back to memory
02192             movq [edi + ebx - 8], mm0
02193             // Prep Raw(x-bpp) for next loop
02194             movq mm2, mm0    // mov updated Raws to mm2
02195             jb davg2lp
02196         } // end _asm block
02197       }
02198       break;
02199 
02200       case 1:                 // bpp == 1
02201       {
02202          _asm {
02203             // Re-init address pointers and offset
02204             mov ebx, diff     // ebx ==> x = offset to alignment boundary
02205             mov edi, row      // edi ==> Avg(x)
02206             cmp ebx, FullLength  // Test if offset at end of array
02207             jnb davg1end
02208             // Do Avg decode for remaining bytes
02209             mov esi, prev_row    // esi ==> Prior(x)
02210             mov edx, edi
02211             xor ecx, ecx         // zero ecx before using cl & cx in loop below
02212             sub edx, bpp         // edx ==> Raw(x-bpp)
02213 davg1lp:
02214             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
02215             xor eax, eax
02216             mov cl, [esi + ebx]  // load cl with Prior(x)
02217             mov al, [edx + ebx]  // load al with Raw(x-bpp)
02218             add ax, cx
02219             inc ebx
02220             shr ax, 1            // divide by 2
02221             add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
02222             cmp ebx, FullLength  // Check if at end of array
02223             mov [edi+ebx-1], al  // Write back Raw(x);
02224                          // mov does not affect flags; -1 to offset inc ebx
02225             jb davg1lp
02226 davg1end:
02227          } // end _asm block
02228       }
02229       return;
02230 
02231       case 8:             // bpp == 8
02232       {
02233          _asm {
02234             // Re-init address pointers and offset
02235             mov ebx, diff           // ebx ==> x = offset to alignment boundary
02236             movq mm5, LBCarryMask
02237             mov edi, row            // edi ==> Avg(x)
02238             movq mm4, HBClearMask
02239             mov esi, prev_row       // esi ==> Prior(x)
02240             // PRIME the pump (load the first Raw(x-bpp) data set
02241             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
02242                                 // (NO NEED to correct position in loop below)
02243 davg8lp:
02244             movq mm0, [edi + ebx]
02245             movq mm3, mm5
02246             movq mm1, [esi + ebx]
02247             add ebx, 8
02248             pand mm3, mm1       // get lsb for each prev_row byte
02249             psrlq mm1, 1        // divide prev_row bytes by 2
02250             pand mm3, mm2       // get LBCarrys for each byte where both
02251                                 // lsb's were == 1
02252             psrlq mm2, 1        // divide raw bytes by 2
02253             pand  mm1, mm4      // clear invalid bit 7 of each byte
02254             paddb mm0, mm3      // add LBCarrys to Avg for each byte
02255             pand  mm2, mm4      // clear invalid bit 7 of each byte
02256             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
02257             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
02258             cmp ebx, MMXLength
02259             movq [edi + ebx - 8], mm0
02260             movq mm2, mm0       // reuse as Raw(x-bpp)
02261             jb davg8lp
02262         } // end _asm block
02263       }
02264       break;
02265       default:                  // bpp greater than 8
02266       {
02267         _asm {
02268             movq mm5, LBCarryMask
02269             // Re-init address pointers and offset
02270             mov ebx, diff       // ebx ==> x = offset to alignment boundary
02271             mov edi, row        // edi ==> Avg(x)
02272             movq mm4, HBClearMask
02273             mov edx, edi
02274             mov esi, prev_row   // esi ==> Prior(x)
02275             sub edx, bpp        // edx ==> Raw(x-bpp)
02276 davgAlp:
02277             movq mm0, [edi + ebx]
02278             movq mm3, mm5
02279             movq mm1, [esi + ebx]
02280             pand mm3, mm1       // get lsb for each prev_row byte
02281             movq mm2, [edx + ebx]
02282             psrlq mm1, 1        // divide prev_row bytes by 2
02283             pand mm3, mm2       // get LBCarrys for each byte where both
02284                                 // lsb's were == 1
02285             psrlq mm2, 1        // divide raw bytes by 2
02286             pand  mm1, mm4      // clear invalid bit 7 of each byte
02287             paddb mm0, mm3      // add LBCarrys to Avg for each byte
02288             pand  mm2, mm4      // clear invalid bit 7 of each byte
02289             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
02290             add ebx, 8
02291             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
02292             cmp ebx, MMXLength
02293             movq [edi + ebx - 8], mm0
02294             jb davgAlp
02295         } // end _asm block
02296       }
02297       break;
02298    }                         // end switch ( bpp )
02299 
02300    _asm {
02301          // MMX acceleration complete now do clean-up
02302          // Check if any remaining bytes left to decode
02303          mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
02304          mov edi, row          // edi ==> Avg(x)
02305          cmp ebx, FullLength   // Test if offset at end of array
02306          jnb davgend
02307          // Do Avg decode for remaining bytes
02308          mov esi, prev_row     // esi ==> Prior(x)
02309          mov edx, edi
02310          xor ecx, ecx          // zero ecx before using cl & cx in loop below
02311          sub edx, bpp          // edx ==> Raw(x-bpp)
02312 davglp2:
02313          // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
02314          xor eax, eax
02315          mov cl, [esi + ebx]   // load cl with Prior(x)
02316          mov al, [edx + ebx]   // load al with Raw(x-bpp)
02317          add ax, cx
02318          inc ebx
02319          shr ax, 1              // divide by 2
02320          add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
02321          cmp ebx, FullLength    // Check if at end of array
02322          mov [edi+ebx-1], al    // Write back Raw(x);
02323                           // mov does not affect flags; -1 to offset inc ebx
02324          jb davglp2
02325 davgend:
02326          emms             // End MMX instructions; prep for possible FP instrs.
02327    } // end _asm block
02328 }
02329 
02330 // Optimized code for PNG Paeth filter decoder
02331 void /* PRIVATE */
02332 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
02333                               png_bytep prev_row)
02334 {
02335    png_uint_32 FullLength;
02336    png_uint_32 MMXLength;
02337    //png_uint_32 len;
02338    int bpp;
02339    int diff;
02340    //int ptemp;
02341    int patemp, pbtemp, pctemp;
02342 
02343    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
02344    FullLength  = row_info->rowbytes; // # of bytes to filter
02345    _asm
02346    {
02347          xor ebx, ebx        // ebx ==> x offset
02348          mov edi, row
02349          xor edx, edx        // edx ==> x-bpp offset
02350          mov esi, prev_row
02351          xor eax, eax
02352 
02353          // Compute the Raw value for the first bpp bytes
02354          // Note: the formula works out to be always
02355          //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
02356 dpthrlp:
02357          mov al, [edi + ebx]
02358          add al, [esi + ebx]
02359          inc ebx
02360          cmp ebx, bpp
02361          mov [edi + ebx - 1], al
02362          jb dpthrlp
02363          // get # of bytes to alignment
02364          mov diff, edi         // take start of row
02365          add diff, ebx         // add bpp
02366          xor ecx, ecx
02367          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
02368          and diff, 0xfffffff8  // mask to alignment boundary
02369          sub diff, edi         // subtract from start ==> value ebx at alignment
02370          jz dpthgo
02371          // fix alignment
02372 dpthlp1:
02373          xor eax, eax
02374          // pav = p - a = (a + b - c) - a = b - c
02375          mov al, [esi + ebx]   // load Prior(x) into al
02376          mov cl, [esi + edx]   // load Prior(x-bpp) into cl
02377          sub eax, ecx          // subtract Prior(x-bpp)
02378          mov patemp, eax       // Save pav for later use
02379          xor eax, eax
02380          // pbv = p - b = (a + b - c) - b = a - c
02381          mov al, [edi + edx]   // load Raw(x-bpp) into al
02382          sub eax, ecx          // subtract Prior(x-bpp)
02383          mov ecx, eax
02384          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02385          add eax, patemp       // pcv = pav + pbv
02386          // pc = abs(pcv)
02387          test eax, 0x80000000
02388          jz dpthpca
02389          neg eax               // reverse sign of neg values
02390 dpthpca:
02391          mov pctemp, eax       // save pc for later use
02392          // pb = abs(pbv)
02393          test ecx, 0x80000000
02394          jz dpthpba
02395          neg ecx               // reverse sign of neg values
02396 dpthpba:
02397          mov pbtemp, ecx       // save pb for later use
02398          // pa = abs(pav)
02399          mov eax, patemp
02400          test eax, 0x80000000
02401          jz dpthpaa
02402          neg eax               // reverse sign of neg values
02403 dpthpaa:
02404          mov patemp, eax       // save pa for later use
02405          // test if pa <= pb
02406          cmp eax, ecx
02407          jna dpthabb
02408          // pa > pb; now test if pb <= pc
02409          cmp ecx, pctemp
02410          jna dpthbbc
02411          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
02412          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
02413          jmp dpthpaeth
02414 dpthbbc:
02415          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
02416          mov cl, [esi + ebx]   // load Prior(x) into cl
02417          jmp dpthpaeth
02418 dpthabb:
02419          // pa <= pb; now test if pa <= pc
02420          cmp eax, pctemp
02421          jna dpthabc
02422          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
02423          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
02424          jmp dpthpaeth
02425 dpthabc:
02426          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
02427          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
02428 dpthpaeth:
02429          inc ebx
02430          inc edx
02431          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
02432          add [edi + ebx - 1], cl
02433          cmp ebx, diff
02434          jb dpthlp1
02435 dpthgo:
02436          mov ecx, FullLength
02437          mov eax, ecx
02438          sub eax, ebx          // subtract alignment fix
02439          and eax, 0x00000007   // calc bytes over mult of 8
02440          sub ecx, eax          // drop over bytes from original length
02441          mov MMXLength, ecx
02442    } // end _asm block
02443    // Now do the math for the rest of the row
02444    switch ( bpp )
02445    {
02446       case 3:
02447       {
02448          ActiveMask.use = 0x0000000000ffffff;
02449          ActiveMaskEnd.use = 0xffff000000000000;
02450          ShiftBpp.use = 24;    // == bpp(3) * 8
02451          ShiftRem.use = 40;    // == 64 - 24
02452          _asm
02453          {
02454             mov ebx, diff
02455             mov edi, row
02456             mov esi, prev_row
02457             pxor mm0, mm0
02458             // PRIME the pump (load the first Raw(x-bpp) data set
02459             movq mm1, [edi+ebx-8]
02460 dpth3lp:
02461             psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
02462             movq mm2, [esi + ebx]   // load b=Prior(x)
02463             punpcklbw mm1, mm0      // Unpack High bytes of a
02464             movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
02465             punpcklbw mm2, mm0      // Unpack High bytes of b
02466             psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
02467             // pav = p - a = (a + b - c) - a = b - c
02468             movq mm4, mm2
02469             punpcklbw mm3, mm0      // Unpack High bytes of c
02470             // pbv = p - b = (a + b - c) - b = a - c
02471             movq mm5, mm1
02472             psubw mm4, mm3
02473             pxor mm7, mm7
02474             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02475             movq mm6, mm4
02476             psubw mm5, mm3
02477 
02478             // pa = abs(p-a) = abs(pav)
02479             // pb = abs(p-b) = abs(pbv)
02480             // pc = abs(p-c) = abs(pcv)
02481             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
02482             paddw mm6, mm5
02483             pand mm0, mm4       // Only pav bytes < 0 in mm7
02484             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
02485             psubw mm4, mm0
02486             pand mm7, mm5       // Only pbv bytes < 0 in mm0
02487             psubw mm4, mm0
02488             psubw mm5, mm7
02489             pxor mm0, mm0
02490             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
02491             pand mm0, mm6       // Only pav bytes < 0 in mm7
02492             psubw mm5, mm7
02493             psubw mm6, mm0
02494             //  test pa <= pb
02495             movq mm7, mm4
02496             psubw mm6, mm0
02497             pcmpgtw mm7, mm5    // pa > pb?
02498             movq mm0, mm7
02499             // use mm7 mask to merge pa & pb
02500             pand mm5, mm7
02501             // use mm0 mask copy to merge a & b
02502             pand mm2, mm0
02503             pandn mm7, mm4
02504             pandn mm0, mm1
02505             paddw mm7, mm5
02506             paddw mm0, mm2
02507             //  test  ((pa <= pb)? pa:pb) <= pc
02508             pcmpgtw mm7, mm6       // pab > pc?
02509             pxor mm1, mm1
02510             pand mm3, mm7
02511             pandn mm7, mm0
02512             paddw mm7, mm3
02513             pxor mm0, mm0
02514             packuswb mm7, mm1
02515             movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
02516             pand mm7, ActiveMask
02517             movq mm2, mm3           // load b=Prior(x) step 1
02518             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
02519             punpcklbw mm3, mm0      // Unpack High bytes of c
02520             movq [edi + ebx], mm7   // write back updated value
02521             movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
02522             // Now do Paeth for 2nd set of bytes (3-5)
02523             psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
02524             punpcklbw mm1, mm0      // Unpack High bytes of a
02525             pxor mm7, mm7
02526             punpcklbw mm2, mm0      // Unpack High bytes of b
02527             // pbv = p - b = (a + b - c) - b = a - c
02528             movq mm5, mm1
02529             // pav = p - a = (a + b - c) - a = b - c
02530             movq mm4, mm2
02531             psubw mm5, mm3
02532             psubw mm4, mm3
02533             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
02534             //       pav + pbv = pbv + pav
02535             movq mm6, mm5
02536             paddw mm6, mm4
02537 
02538             // pa = abs(p-a) = abs(pav)
02539             // pb = abs(p-b) = abs(pbv)
02540             // pc = abs(p-c) = abs(pcv)
02541             pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
02542             pcmpgtw mm7, mm4       // Create mask pav bytes < 0
02543             pand mm0, mm5          // Only pbv bytes < 0 in mm0
02544             pand mm7, mm4          // Only pav bytes < 0 in mm7
02545             psubw mm5, mm0
02546             psubw mm4, mm7
02547             psubw mm5, mm0
02548             psubw mm4, mm7
02549             pxor mm0, mm0
02550             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
02551             pand mm0, mm6          // Only pav bytes < 0 in mm7
02552             psubw mm6, mm0
02553             //  test pa <= pb
02554             movq mm7, mm4
02555             psubw mm6, mm0
02556             pcmpgtw mm7, mm5       // pa > pb?
02557             movq mm0, mm7
02558             // use mm7 mask to merge pa & pb
02559             pand mm5, mm7
02560             // use mm0 mask copy to merge a & b
02561             pand mm2, mm0
02562             pandn mm7, mm4
02563             pandn mm0, mm1
02564             paddw mm7, mm5
02565             paddw mm0, mm2
02566             //  test  ((pa <= pb)? pa:pb) <= pc
02567             pcmpgtw mm7, mm6       // pab > pc?
02568             movq mm2, [esi + ebx]  // load b=Prior(x)
02569             pand mm3, mm7
02570             pandn mm7, mm0
02571             pxor mm1, mm1
02572             paddw mm7, mm3
02573             pxor mm0, mm0
02574             packuswb mm7, mm1
02575             movq mm3, mm2           // load c=Prior(x-bpp) step 1
02576             pand mm7, ActiveMask
02577             punpckhbw mm2, mm0      // Unpack High bytes of b
02578             psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
02579              // pav = p - a = (a + b - c) - a = b - c
02580             movq mm4, mm2
02581             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
02582             psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
02583             movq [edi + ebx], mm7   // write back updated value
02584             movq mm1, mm7
02585             punpckhbw mm3, mm0      // Unpack High bytes of c
02586             psllq mm1, ShiftBpp     // Shift bytes
02587                                     // Now mm1 will be used as Raw(x-bpp)
02588             // Now do Paeth for 3rd, and final, set of bytes (6-7)
02589             pxor mm7, mm7
02590             punpckhbw mm1, mm0      // Unpack High bytes of a
02591             psubw mm4, mm3
02592             // pbv = p - b = (a + b - c) - b = a - c
02593             movq mm5, mm1
02594             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02595             movq mm6, mm4
02596             psubw mm5, mm3
02597             pxor mm0, mm0
02598             paddw mm6, mm5
02599 
02600             // pa = abs(p-a) = abs(pav)
02601             // pb = abs(p-b) = abs(pbv)
02602             // pc = abs(p-c) = abs(pcv)
02603             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
02604             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
02605             pand mm0, mm4       // Only pav bytes < 0 in mm7
02606             pand mm7, mm5       // Only pbv bytes < 0 in mm0
02607             psubw mm4, mm0
02608             psubw mm5, mm7
02609             psubw mm4, mm0
02610             psubw mm5, mm7
02611             pxor mm0, mm0
02612             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
02613             pand mm0, mm6       // Only pav bytes < 0 in mm7
02614             psubw mm6, mm0
02615             //  test pa <= pb
02616             movq mm7, mm4
02617             psubw mm6, mm0
02618             pcmpgtw mm7, mm5    // pa > pb?
02619             movq mm0, mm7
02620             // use mm0 mask copy to merge a & b
02621             pand mm2, mm0
02622             // use mm7 mask to merge pa & pb
02623             pand mm5, mm7
02624             pandn mm0, mm1
02625             pandn mm7, mm4
02626             paddw mm0, mm2
02627             paddw mm7, mm5
02628             //  test  ((pa <= pb)? pa:pb) <= pc
02629             pcmpgtw mm7, mm6    // pab > pc?
02630             pand mm3, mm7
02631             pandn mm7, mm0
02632             paddw mm7, mm3
02633             pxor mm1, mm1
02634             packuswb mm1, mm7
02635             // Step ebx to next set of 8 bytes and repeat loop til done
02636             add ebx, 8
02637             pand mm1, ActiveMaskEnd
02638             paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
02639 
02640             cmp ebx, MMXLength
02641             pxor mm0, mm0              // pxor does not affect flags
02642             movq [edi + ebx - 8], mm1  // write back updated value
02643                                  // mm1 will be used as Raw(x-bpp) next loop
02644                            // mm3 ready to be used as Prior(x-bpp) next loop
02645             jb dpth3lp
02646          } // end _asm block
02647       }
02648       break;
02649 
02650       case 6:
02651       case 7:
02652       case 5:
02653       {
02654          ActiveMask.use  = 0x00000000ffffffff;
02655          ActiveMask2.use = 0xffffffff00000000;
02656          ShiftBpp.use = bpp << 3;    // == bpp * 8
02657          ShiftRem.use = 64 - ShiftBpp.use;
02658          _asm
02659          {
02660             mov ebx, diff
02661             mov edi, row
02662             mov esi, prev_row
02663             // PRIME the pump (load the first Raw(x-bpp) data set
02664             movq mm1, [edi+ebx-8]
02665             pxor mm0, mm0
02666 dpth6lp:
02667             // Must shift to position Raw(x-bpp) data
02668             psrlq mm1, ShiftRem
02669             // Do first set of 4 bytes
02670             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
02671             punpcklbw mm1, mm0      // Unpack Low bytes of a
02672             movq mm2, [esi + ebx]   // load b=Prior(x)
02673             punpcklbw mm2, mm0      // Unpack Low bytes of b
02674             // Must shift to position Prior(x-bpp) data
02675             psrlq mm3, ShiftRem
02676             // pav = p - a = (a + b - c) - a = b - c
02677             movq mm4, mm2
02678             punpcklbw mm3, mm0      // Unpack Low bytes of c
02679             // pbv = p - b = (a + b - c) - b = a - c
02680             movq mm5, mm1
02681             psubw mm4, mm3
02682             pxor mm7, mm7
02683             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02684             movq mm6, mm4
02685             psubw mm5, mm3
02686             // pa = abs(p-a) = abs(pav)
02687             // pb = abs(p-b) = abs(pbv)
02688             // pc = abs(p-c) = abs(pcv)
02689             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
02690             paddw mm6, mm5
02691             pand mm0, mm4       // Only pav bytes < 0 in mm7
02692             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
02693             psubw mm4, mm0
02694             pand mm7, mm5       // Only pbv bytes < 0 in mm0
02695             psubw mm4, mm0
02696             psubw mm5, mm7
02697             pxor mm0, mm0
02698             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
02699             pand mm0, mm6       // Only pav bytes < 0 in mm7
02700             psubw mm5, mm7
02701             psubw mm6, mm0
02702             //  test pa <= pb
02703             movq mm7, mm4
02704             psubw mm6, mm0
02705             pcmpgtw mm7, mm5    // pa > pb?
02706             movq mm0, mm7
02707             // use mm7 mask to merge pa & pb
02708             pand mm5, mm7
02709             // use mm0 mask copy to merge a & b
02710             pand mm2, mm0
02711             pandn mm7, mm4
02712             pandn mm0, mm1
02713             paddw mm7, mm5
02714             paddw mm0, mm2
02715             //  test  ((pa <= pb)? pa:pb) <= pc
02716             pcmpgtw mm7, mm6    // pab > pc?
02717             pxor mm1, mm1
02718             pand mm3, mm7
02719             pandn mm7, mm0
02720             paddw mm7, mm3
02721             pxor mm0, mm0
02722             packuswb mm7, mm1
02723             movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
02724             pand mm7, ActiveMask
02725             psrlq mm3, ShiftRem
02726             movq mm2, [esi + ebx]      // load b=Prior(x) step 1
02727             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
02728             movq mm6, mm2
02729             movq [edi + ebx], mm7      // write back updated value
02730             movq mm1, [edi+ebx-8]
02731             psllq mm6, ShiftBpp
02732             movq mm5, mm7
02733             psrlq mm1, ShiftRem
02734             por mm3, mm6
02735             psllq mm5, ShiftBpp
02736             punpckhbw mm3, mm0         // Unpack High bytes of c
02737             por mm1, mm5
02738             // Do second set of 4 bytes
02739             punpckhbw mm2, mm0         // Unpack High bytes of b
02740             punpckhbw mm1, mm0         // Unpack High bytes of a
02741             // pav = p - a = (a + b - c) - a = b - c
02742             movq mm4, mm2
02743             // pbv = p - b = (a + b - c) - b = a - c
02744             movq mm5, mm1
02745             psubw mm4, mm3
02746             pxor mm7, mm7
02747             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02748             movq mm6, mm4
02749             psubw mm5, mm3
02750             // pa = abs(p-a) = abs(pav)
02751             // pb = abs(p-b) = abs(pbv)
02752             // pc = abs(p-c) = abs(pcv)
02753             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
02754             paddw mm6, mm5
02755             pand mm0, mm4          // Only pav bytes < 0 in mm7
02756             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
02757             psubw mm4, mm0
02758             pand mm7, mm5          // Only pbv bytes < 0 in mm0
02759             psubw mm4, mm0
02760             psubw mm5, mm7
02761             pxor mm0, mm0
02762             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
02763             pand mm0, mm6          // Only pav bytes < 0 in mm7
02764             psubw mm5, mm7
02765             psubw mm6, mm0
02766             //  test pa <= pb
02767             movq mm7, mm4
02768             psubw mm6, mm0
02769             pcmpgtw mm7, mm5       // pa > pb?
02770             movq mm0, mm7
02771             // use mm7 mask to merge pa & pb
02772             pand mm5, mm7
02773             // use mm0 mask copy to merge a & b
02774             pand mm2, mm0
02775             pandn mm7, mm4
02776             pandn mm0, mm1
02777             paddw mm7, mm5
02778             paddw mm0, mm2
02779             //  test  ((pa <= pb)? pa:pb) <= pc
02780             pcmpgtw mm7, mm6           // pab > pc?
02781             pxor mm1, mm1
02782             pand mm3, mm7
02783             pandn mm7, mm0
02784             pxor mm1, mm1
02785             paddw mm7, mm3
02786             pxor mm0, mm0
02787             // Step ebx to next set of 8 bytes and repeat loop til done
02788             add ebx, 8
02789             packuswb mm1, mm7
02790             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
02791             cmp ebx, MMXLength
02792             movq [edi + ebx - 8], mm1      // write back updated value
02793                                 // mm1 will be used as Raw(x-bpp) next loop
02794             jb dpth6lp
02795          } // end _asm block
02796       }
02797       break;
02798 
02799       case 4:
02800       {
02801          ActiveMask.use  = 0x00000000ffffffff;
02802          _asm {
02803             mov ebx, diff
02804             mov edi, row
02805             mov esi, prev_row
02806             pxor mm0, mm0
02807             // PRIME the pump (load the first Raw(x-bpp) data set
02808             movq mm1, [edi+ebx-8]    // Only time should need to read
02809                                      //  a=Raw(x-bpp) bytes
02810 dpth4lp:
02811             // Do first set of 4 bytes
02812             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
02813             punpckhbw mm1, mm0       // Unpack Low bytes of a
02814             movq mm2, [esi + ebx]    // load b=Prior(x)
02815             punpcklbw mm2, mm0       // Unpack High bytes of b
02816             // pav = p - a = (a + b - c) - a = b - c
02817             movq mm4, mm2
02818             punpckhbw mm3, mm0       // Unpack High bytes of c
02819             // pbv = p - b = (a + b - c) - b = a - c
02820             movq mm5, mm1
02821             psubw mm4, mm3
02822             pxor mm7, mm7
02823             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02824             movq mm6, mm4
02825             psubw mm5, mm3
02826             // pa = abs(p-a) = abs(pav)
02827             // pb = abs(p-b) = abs(pbv)
02828             // pc = abs(p-c) = abs(pcv)
02829             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
02830             paddw mm6, mm5
02831             pand mm0, mm4          // Only pav bytes < 0 in mm7
02832             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
02833             psubw mm4, mm0
02834             pand mm7, mm5          // Only pbv bytes < 0 in mm0
02835             psubw mm4, mm0
02836             psubw mm5, mm7
02837             pxor mm0, mm0
02838             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
02839             pand mm0, mm6          // Only pav bytes < 0 in mm7
02840             psubw mm5, mm7
02841             psubw mm6, mm0
02842             //  test pa <= pb
02843             movq mm7, mm4
02844             psubw mm6, mm0
02845             pcmpgtw mm7, mm5       // pa > pb?
02846             movq mm0, mm7
02847             // use mm7 mask to merge pa & pb
02848             pand mm5, mm7
02849             // use mm0 mask copy to merge a & b
02850             pand mm2, mm0
02851             pandn mm7, mm4
02852             pandn mm0, mm1
02853             paddw mm7, mm5
02854             paddw mm0, mm2
02855             //  test  ((pa <= pb)? pa:pb) <= pc
02856             pcmpgtw mm7, mm6       // pab > pc?
02857             pxor mm1, mm1
02858             pand mm3, mm7
02859             pandn mm7, mm0
02860             paddw mm7, mm3
02861             pxor mm0, mm0
02862             packuswb mm7, mm1
02863             movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
02864             pand mm7, ActiveMask
02865             movq mm2, mm3              // load b=Prior(x) step 1
02866             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
02867             punpcklbw mm3, mm0         // Unpack High bytes of c
02868             movq [edi + ebx], mm7      // write back updated value
02869             movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
02870             // Do second set of 4 bytes
02871             punpckhbw mm2, mm0         // Unpack Low bytes of b
02872             punpcklbw mm1, mm0         // Unpack Low bytes of a
02873             // pav = p - a = (a + b - c) - a = b - c
02874             movq mm4, mm2
02875             // pbv = p - b = (a + b - c) - b = a - c
02876             movq mm5, mm1
02877             psubw mm4, mm3
02878             pxor mm7, mm7
02879             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02880             movq mm6, mm4
02881             psubw mm5, mm3
02882             // pa = abs(p-a) = abs(pav)
02883             // pb = abs(p-b) = abs(pbv)
02884             // pc = abs(p-c) = abs(pcv)
02885             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
02886             paddw mm6, mm5
02887             pand mm0, mm4          // Only pav bytes < 0 in mm7
02888             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
02889             psubw mm4, mm0
02890             pand mm7, mm5          // Only pbv bytes < 0 in mm0
02891             psubw mm4, mm0
02892             psubw mm5, mm7
02893             pxor mm0, mm0
02894             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
02895             pand mm0, mm6          // Only pav bytes < 0 in mm7
02896             psubw mm5, mm7
02897             psubw mm6, mm0
02898             //  test pa <= pb
02899             movq mm7, mm4
02900             psubw mm6, mm0
02901             pcmpgtw mm7, mm5       // pa > pb?
02902             movq mm0, mm7
02903             // use mm7 mask to merge pa & pb
02904             pand mm5, mm7
02905             // use mm0 mask copy to merge a & b
02906             pand mm2, mm0
02907             pandn mm7, mm4
02908             pandn mm0, mm1
02909             paddw mm7, mm5
02910             paddw mm0, mm2
02911             //  test  ((pa <= pb)? pa:pb) <= pc
02912             pcmpgtw mm7, mm6       // pab > pc?
02913             pxor mm1, mm1
02914             pand mm3, mm7
02915             pandn mm7, mm0
02916             pxor mm1, mm1
02917             paddw mm7, mm3
02918             pxor mm0, mm0
02919             // Step ebx to next set of 8 bytes and repeat loop til done
02920             add ebx, 8
02921             packuswb mm1, mm7
02922             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
02923             cmp ebx, MMXLength
02924             movq [edi + ebx - 8], mm1      // write back updated value
02925                                 // mm1 will be used as Raw(x-bpp) next loop
02926             jb dpth4lp
02927          } // end _asm block
02928       }
02929       break;
02930       case 8:                          // bpp == 8
02931       {
02932          ActiveMask.use  = 0x00000000ffffffff;
02933          _asm {
02934             mov ebx, diff
02935             mov edi, row
02936             mov esi, prev_row
02937             pxor mm0, mm0
02938             // PRIME the pump (load the first Raw(x-bpp) data set
02939             movq mm1, [edi+ebx-8]      // Only time should need to read
02940                                        //  a=Raw(x-bpp) bytes
02941 dpth8lp:
02942             // Do first set of 4 bytes
02943             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
02944             punpcklbw mm1, mm0         // Unpack Low bytes of a
02945             movq mm2, [esi + ebx]      // load b=Prior(x)
02946             punpcklbw mm2, mm0         // Unpack Low bytes of b
02947             // pav = p - a = (a + b - c) - a = b - c
02948             movq mm4, mm2
02949             punpcklbw mm3, mm0         // Unpack Low bytes of c
02950             // pbv = p - b = (a + b - c) - b = a - c
02951             movq mm5, mm1
02952             psubw mm4, mm3
02953             pxor mm7, mm7
02954             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
02955             movq mm6, mm4
02956             psubw mm5, mm3
02957             // pa = abs(p-a) = abs(pav)
02958             // pb = abs(p-b) = abs(pbv)
02959             // pc = abs(p-c) = abs(pcv)
02960             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
02961             paddw mm6, mm5
02962             pand mm0, mm4          // Only pav bytes < 0 in mm7
02963             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
02964             psubw mm4, mm0
02965             pand mm7, mm5          // Only pbv bytes < 0 in mm0
02966             psubw mm4, mm0
02967             psubw mm5, mm7
02968             pxor mm0, mm0
02969             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
02970             pand mm0, mm6          // Only pav bytes < 0 in mm7
02971             psubw mm5, mm7
02972             psubw mm6, mm0
02973             //  test pa <= pb
02974             movq mm7, mm4
02975             psubw mm6, mm0
02976             pcmpgtw mm7, mm5       // pa > pb?
02977             movq mm0, mm7
02978             // use mm7 mask to merge pa & pb
02979             pand mm5, mm7
02980             // use mm0 mask copy to merge a & b
02981             pand mm2, mm0
02982             pandn mm7, mm4
02983             pandn mm0, mm1
02984             paddw mm7, mm5
02985             paddw mm0, mm2
02986             //  test  ((pa <= pb)? pa:pb) <= pc
02987             pcmpgtw mm7, mm6       // pab > pc?
02988             pxor mm1, mm1
02989             pand mm3, mm7
02990             pandn mm7, mm0
02991             paddw mm7, mm3
02992             pxor mm0, mm0
02993             packuswb mm7, mm1
02994             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
02995             pand mm7, ActiveMask
02996             movq mm2, [esi + ebx]    // load b=Prior(x)
02997             paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
02998             punpckhbw mm3, mm0       // Unpack High bytes of c
02999             movq [edi + ebx], mm7    // write back updated value
03000             movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
03001 
03002             // Do second set of 4 bytes
03003             punpckhbw mm2, mm0       // Unpack High bytes of b
03004             punpckhbw mm1, mm0       // Unpack High bytes of a
03005             // pav = p - a = (a + b - c) - a = b - c
03006             movq mm4, mm2
03007             // pbv = p - b = (a + b - c) - b = a - c
03008             movq mm5, mm1
03009             psubw mm4, mm3
03010             pxor mm7, mm7
03011             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03012             movq mm6, mm4
03013             psubw mm5, mm3
03014             // pa = abs(p-a) = abs(pav)
03015             // pb = abs(p-b) = abs(pbv)
03016             // pc = abs(p-c) = abs(pcv)
03017             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
03018             paddw mm6, mm5
03019             pand mm0, mm4          // Only pav bytes < 0 in mm7
03020             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
03021             psubw mm4, mm0
03022             pand mm7, mm5          // Only pbv bytes < 0 in mm0
03023             psubw mm4, mm0
03024             psubw mm5, mm7
03025             pxor mm0, mm0
03026             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
03027             pand mm0, mm6          // Only pav bytes < 0 in mm7
03028             psubw mm5, mm7
03029             psubw mm6, mm0
03030             //  test pa <= pb
03031             movq mm7, mm4
03032             psubw mm6, mm0
03033             pcmpgtw mm7, mm5       // pa > pb?
03034             movq mm0, mm7
03035             // use mm7 mask to merge pa & pb
03036             pand mm5, mm7
03037             // use mm0 mask copy to merge a & b
03038             pand mm2, mm0
03039             pandn mm7, mm4
03040             pandn mm0, mm1
03041             paddw mm7, mm5
03042             paddw mm0, mm2
03043             //  test  ((pa <= pb)? pa:pb) <= pc
03044             pcmpgtw mm7, mm6       // pab > pc?
03045             pxor mm1, mm1
03046             pand mm3, mm7
03047             pandn mm7, mm0
03048             pxor mm1, mm1
03049             paddw mm7, mm3
03050             pxor mm0, mm0
03051             // Step ex to next set of 8 bytes and repeat loop til done
03052             add ebx, 8
03053             packuswb mm1, mm7
03054             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
03055             cmp ebx, MMXLength
03056             movq [edi + ebx - 8], mm1      // write back updated value
03057                             // mm1 will be used as Raw(x-bpp) next loop
03058             jb dpth8lp
03059          } // end _asm block
03060       }
03061       break;
03062 
03063       case 1:                // bpp = 1
03064       case 2:                // bpp = 2
03065       default:               // bpp > 8
03066       {
03067          _asm {
03068             mov ebx, diff
03069             cmp ebx, FullLength
03070             jnb dpthdend
03071             mov edi, row
03072             mov esi, prev_row
03073             // Do Paeth decode for remaining bytes
03074             mov edx, ebx
03075             xor ecx, ecx        // zero ecx before using cl & cx in loop below
03076             sub edx, bpp        // Set edx = ebx - bpp
03077 dpthdlp:
03078             xor eax, eax
03079             // pav = p - a = (a + b - c) - a = b - c
03080             mov al, [esi + ebx]        // load Prior(x) into al
03081             mov cl, [esi + edx]        // load Prior(x-bpp) into cl
03082             sub eax, ecx                 // subtract Prior(x-bpp)
03083             mov patemp, eax                 // Save pav for later use
03084             xor eax, eax
03085             // pbv = p - b = (a + b - c) - b = a - c
03086             mov al, [edi + edx]        // load Raw(x-bpp) into al
03087             sub eax, ecx                 // subtract Prior(x-bpp)
03088             mov ecx, eax
03089             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03090             add eax, patemp                 // pcv = pav + pbv
03091             // pc = abs(pcv)
03092             test eax, 0x80000000
03093             jz dpthdpca
03094             neg eax                     // reverse sign of neg values
03095 dpthdpca:
03096             mov pctemp, eax             // save pc for later use
03097             // pb = abs(pbv)
03098             test ecx, 0x80000000
03099             jz dpthdpba
03100             neg ecx                     // reverse sign of neg values
03101 dpthdpba:
03102             mov pbtemp, ecx             // save pb for later use
03103             // pa = abs(pav)
03104             mov eax, patemp
03105             test eax, 0x80000000
03106             jz dpthdpaa
03107             neg eax                     // reverse sign of neg values
03108 dpthdpaa:
03109             mov patemp, eax             // save pa for later use
03110             // test if pa <= pb
03111             cmp eax, ecx
03112             jna dpthdabb
03113             // pa > pb; now test if pb <= pc
03114             cmp ecx, pctemp
03115             jna dpthdbbc
03116             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03117             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
03118             jmp dpthdpaeth
03119 dpthdbbc:
03120             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
03121             mov cl, [esi + ebx]        // load Prior(x) into cl
03122             jmp dpthdpaeth
03123 dpthdabb:
03124             // pa <= pb; now test if pa <= pc
03125             cmp eax, pctemp
03126             jna dpthdabc
03127             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03128             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
03129             jmp dpthdpaeth
03130 dpthdabc:
03131             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
03132             mov cl, [edi + edx]  // load Raw(x-bpp) into cl
03133 dpthdpaeth:
03134             inc ebx
03135             inc edx
03136             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
03137             add [edi + ebx - 1], cl
03138             cmp ebx, FullLength
03139             jb dpthdlp
03140 dpthdend:
03141          } // end _asm block
03142       }
03143       return;                   // No need to go further with this one
03144    }                         // end switch ( bpp )
03145    _asm
03146    {
03147          // MMX acceleration complete now do clean-up
03148          // Check if any remaining bytes left to decode
03149          mov ebx, MMXLength
03150          cmp ebx, FullLength
03151          jnb dpthend
03152          mov edi, row
03153          mov esi, prev_row
03154          // Do Paeth decode for remaining bytes
03155          mov edx, ebx
03156          xor ecx, ecx         // zero ecx before using cl & cx in loop below
03157          sub edx, bpp         // Set edx = ebx - bpp
03158 dpthlp2:
03159          xor eax, eax
03160          // pav = p - a = (a + b - c) - a = b - c
03161          mov al, [esi + ebx]  // load Prior(x) into al
03162          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
03163          sub eax, ecx         // subtract Prior(x-bpp)
03164          mov patemp, eax      // Save pav for later use
03165          xor eax, eax
03166          // pbv = p - b = (a + b - c) - b = a - c
03167          mov al, [edi + edx]  // load Raw(x-bpp) into al
03168          sub eax, ecx         // subtract Prior(x-bpp)
03169          mov ecx, eax
03170          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
03171          add eax, patemp      // pcv = pav + pbv
03172          // pc = abs(pcv)
03173          test eax, 0x80000000
03174          jz dpthpca2
03175          neg eax              // reverse sign of neg values
03176 dpthpca2:
03177          mov pctemp, eax      // save pc for later use
03178          // pb = abs(pbv)
03179          test ecx, 0x80000000
03180          jz dpthpba2
03181          neg ecx              // reverse sign of neg values
03182 dpthpba2:
03183          mov pbtemp, ecx      // save pb for later use
03184          // pa = abs(pav)
03185          mov eax, patemp
03186          test eax, 0x80000000
03187          jz dpthpaa2
03188          neg eax              // reverse sign of neg values
03189 dpthpaa2:
03190          mov patemp, eax      // save pa for later use
03191          // test if pa <= pb
03192          cmp eax, ecx
03193          jna dpthabb2
03194          // pa > pb; now test if pb <= pc
03195          cmp ecx, pctemp
03196          jna dpthbbc2
03197          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03198          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
03199          jmp dpthpaeth2
03200 dpthbbc2:
03201          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
03202          mov cl, [esi + ebx]        // load Prior(x) into cl
03203          jmp dpthpaeth2
03204 dpthabb2:
03205          // pa <= pb; now test if pa <= pc
03206          cmp eax, pctemp
03207          jna dpthabc2
03208          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
03209          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
03210          jmp dpthpaeth2
03211 dpthabc2:
03212          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
03213          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
03214 dpthpaeth2:
03215          inc ebx
03216          inc edx
03217          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
03218          add [edi + ebx - 1], cl
03219          cmp ebx, FullLength
03220          jb dpthlp2
03221 dpthend:
03222          emms             // End MMX instructions; prep for possible FP instrs.
03223    } // end _asm block
03224 }
03225 
03226 // Optimized code for PNG Sub filter decoder
      //
      // Reverses the PNG "Sub" filter in place on one row:
      //    Raw(x) = Sub(x) + Raw(x - bpp)   (mod 256)
      // for every byte x in [bpp, rowbytes).  The first bpp bytes of the row
      // have no left neighbour and are left untouched.
      //
      //   row_info  supplies pixel_depth (used to derive bpp) and rowbytes
      //   row       filtered row data; overwritten with the reconstructed bytes
      //
      // Layout of the work (offsets are relative to rp = row + bpp):
      //   1. scalar loop over [0, diff)            - brings rp+diff to an 8-byte
      //                                              boundary; diff is 8..15
      //   2. MMX loop over    [diff, MMXLength)    - whole 8-byte groups,
      //                                              specialised per bpp
      //   3. scalar loop over [MMXLength, FullLength) - leftover tail bytes
      //
      // ActiveMask, ShiftBpp and ShiftRem are not declared in this function;
      // they are 64-bit scratch operands declared elsewhere in the file.
      //
      // Steps 1 and 2 execute their bodies at least once without comparing
      // against FullLength, so the assembly path is only safe when the row is
      // long enough.  Rows that are too short are handled by a plain C loop
      // instead (see the guard below).
03227 void /* PRIVATE */
03228 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
03229 {
03231    int bpp;
03232    png_uint_32 FullLength;
03233    png_uint_32 MMXLength;
03234    int diff;
03235
03236    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
03237    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter

         // Short-row guard.  The alignment loop below always processes diff
         // (at most 15) bytes and the MMX loops are do-while loops that always
         // process one full group (64 bytes in the bpp == 8 case).  If fewer
         // than 15 + 64 bytes remain after the first pixel, fall back to the
         // scalar algorithm so nothing past the end of the row is touched.
         // The test is written on rowbytes so it cannot wrap when
         // rowbytes < bpp.
         if (row_info->rowbytes < (png_uint_32)bpp + 15 + 64)
         {
            png_uint_32 i;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = (png_uint_32)bpp; i < row_info->rowbytes; i++, rp++, lp++)
               *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
            return;
         }

03238    _asm {
03239         mov edi, row
03240         mov esi, edi               // lp = row
03241         add edi, bpp               // rp = row + bpp
03242         xor eax, eax
03243         // get # of bytes to alignment
03244         mov diff, edi               // take start of row
03245         add diff, 0xf               // add 7 + 8 to incr past
03246                                         // alignment boundary
03247         xor ebx, ebx
03248         and diff, 0xfffffff8        // mask to alignment boundary
03249         sub diff, edi               // subtract from start ==> value
03250                                         //  ebx at alignment
03251         jz dsubgo
03252         // fix alignment: scalar Sub decode of the first diff bytes
03253 dsublp1:
03254         mov al, [esi+ebx]
03255         add [edi+ebx], al
03256         inc ebx
03257         cmp ebx, diff
03258         jb dsublp1
03259 dsubgo:
03260         mov ecx, FullLength
03261         mov edx, ecx
03262         sub edx, ebx                  // subtract alignment fix
03263         and edx, 0x00000007           // calc bytes over mult of 8
03264         sub ecx, edx                  // drop over bytes from length
03265         mov MMXLength, ecx
03266    } // end _asm block
03267
03268    // Now do the math for the rest of the row
03269    switch ( bpp )
03270    {
03271         case 3:
03272         {
03273          ActiveMask.use  = 0x0000ffffff000000;
03274          ShiftBpp.use = 24;       // == 3 * 8
03275          ShiftRem.use  = 40;      // == 64 - 24
03276          _asm {
03277             mov edi, row
03278             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
03279             mov esi, edi              // lp = row
03280             add edi, bpp          // rp = row + bpp
03281             movq mm6, mm7
03282             mov ebx, diff
03283             psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
03284                                   // byte group
03285             // PRIME the pump (load the first Raw(x-bpp) data set)
03286             movq mm1, [edi+ebx-8]
03287 dsub3lp:
03288             psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
03289                           // no need for mask; shift clears inactive bytes
03290             // Add 1st active group
03291             movq mm0, [edi+ebx]
03292             paddb mm0, mm1
03293             // Add 2nd active group
03294             movq mm1, mm0         // mov updated Raws to mm1
03295             psllq mm1, ShiftBpp   // shift data to position correctly
03296             pand mm1, mm7         // mask to use only 2nd active group
03297             paddb mm0, mm1
03298             // Add 3rd active group
03299             movq mm1, mm0         // mov updated Raws to mm1
03300             psllq mm1, ShiftBpp   // shift data to position correctly
03301             pand mm1, mm6         // mask to use only 3rd active group
03302             add ebx, 8
03303             paddb mm0, mm1
03304             cmp ebx, MMXLength
03305             movq [edi+ebx-8], mm0     // Write updated Raws back to array
03306             // Prep for doing 1st add at top of loop
03307             movq mm1, mm0
03308             jb dsub3lp
03309          } // end _asm block
03310       }
03311       break;
03312
03313       case 1:
03314       {
03315          // Scalar x86 loop over [diff, FullLength); no MMX is used for
03316          // bpp == 1.  It is the assembly equivalent of the non-MMX SUB
03317          // code in png_read_filter_row below:
03318          //         png_bytep rp;
03319          //         png_bytep lp;
03320          //         png_uint_32 i;
03321          //         bpp = (row_info->pixel_depth + 7) >> 3;
03322          //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
03323          //            i < row_info->rowbytes; i++, rp++, lp++)
03324          //      {
03325          //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
03326          //      }
03327          _asm {
03328             mov ebx, diff
03329             mov edi, row
03330             cmp ebx, FullLength
03331             jnb dsub1end
03332             mov esi, edi          // lp = row
03333             xor eax, eax
03334             add edi, bpp      // rp = row + bpp
03335 dsub1lp:
03336             mov al, [esi+ebx]
03337             add [edi+ebx], al
03338             inc ebx
03339             cmp ebx, FullLength
03340             jb dsub1lp
03341 dsub1end:
03342          } // end _asm block
03343       }
03344       return;                 // row finished; no MMX state to clear
03345
03346       case 6:
03347       case 7:
03348       case 4:
03349       case 5:
03350       {
03351          ShiftBpp.use = bpp << 3;
03352          ShiftRem.use = 64 - ShiftBpp.use;
03353          _asm {
03354             mov edi, row
03355             mov ebx, diff
03356             mov esi, edi               // lp = row
03357             add edi, bpp           // rp = row + bpp
03358             // PRIME the pump (load the first Raw(x-bpp) data set)
03359             movq mm1, [edi+ebx-8]
03360 dsub4lp:
03361             psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
03362                           // no need for mask; shift clears inactive bytes
03363             movq mm0, [edi+ebx]
03364             paddb mm0, mm1
03365             // Add 2nd active group
03366             movq mm1, mm0          // mov updated Raws to mm1
03367             psllq mm1, ShiftBpp    // shift data to position correctly
03368                                    // there is no need for any mask
03369                                    // since shift clears inactive bits/bytes
03370             add ebx, 8
03371             paddb mm0, mm1
03372             cmp ebx, MMXLength
03373             movq [edi+ebx-8], mm0
03374             movq mm1, mm0          // Prep for doing 1st add at top of loop
03375             jb dsub4lp
03376          } // end _asm block
03377       }
03378       break;
03379
03380       case 2:
03381       {
03382          ActiveMask.use  = 0x00000000ffff0000;
03383          ShiftBpp.use = 16;       // == 2 * 8
03384          ShiftRem.use = 48;       // == 64 - 16
03385          _asm {
03386             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
03387             mov ebx, diff
03388             movq mm6, mm7
03389             mov edi, row
03390             psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
03391                                     //  byte group
03392             mov esi, edi            // lp = row
03393             movq mm5, mm6
03394             add edi, bpp            // rp = row + bpp
03395             psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
03396                                     //  byte group
03397             // PRIME the pump (load the first Raw(x-bpp) data set)
03398             movq mm1, [edi+ebx-8]
03399 dsub2lp:
03400             // Add 1st active group
03401             psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
03402                                     // no need for mask; shift clears inactive
03403                                     //  bytes
03404             movq mm0, [edi+ebx]
03405             paddb mm0, mm1
03406             // Add 2nd active group
03407             movq mm1, mm0           // mov updated Raws to mm1
03408             psllq mm1, ShiftBpp     // shift data to position correctly
03409             pand mm1, mm7           // mask to use only 2nd active group
03410             paddb mm0, mm1
03411             // Add 3rd active group
03412             movq mm1, mm0           // mov updated Raws to mm1
03413             psllq mm1, ShiftBpp     // shift data to position correctly
03414             pand mm1, mm6           // mask to use only 3rd active group
03415             paddb mm0, mm1
03416             // Add 4th active group
03417             movq mm1, mm0           // mov updated Raws to mm1
03418             psllq mm1, ShiftBpp     // shift data to position correctly
03419             pand mm1, mm5           // mask to use only 4th active group
03420             add ebx, 8
03421             paddb mm0, mm1
03422             cmp ebx, MMXLength
03423             movq [edi+ebx-8], mm0   // Write updated Raws back to array
03424             movq mm1, mm0           // Prep for doing 1st add at top of loop
03425             jb dsub2lp
03426          } // end _asm block
03427       }
03428       break;
03429       case 8:
03430       {
03431          _asm {
03432             mov edi, row
03433             mov ebx, diff
03434             mov esi, edi            // lp = row
03435             add edi, bpp            // rp = row + bpp
03436             mov ecx, MMXLength
03437             movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
03438                                     // Raw(x-bpp) data set)
03439             and ecx, 0x0000003f     // calc bytes over mult of 64
                                          // NOTE(review): this leaves ecx <= 63,
                                          // and ebx is >= 72 after the first
                                          // pass, so the unrolled loop below
                                          // runs exactly once; dsub8lpA then
                                          // finishes the row 8 bytes at a time.
                                          // Results are correct, but the bound
                                          // looks unintended - confirm.
03440 dsub8lp:
03441             movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
03442             paddb mm0, mm7
03443             movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
03444             movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
03445                                    // Now mm0 will be used as Raw(x-bpp) for
03446                                    // the 2nd group of 8 bytes.  This will be
03447                                    // repeated for each group of 8 bytes with
03448                                    // the 8th group being used as the Raw(x-bpp)
03449                                    // for the 1st group of the next loop.
03450             paddb mm1, mm0
03451             movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
03452             movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
03453             paddb mm2, mm1
03454             movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
03455             movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
03456             paddb mm3, mm2
03457             movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
03458             movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
03459             paddb mm4, mm3
03460             movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
03461             movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
03462             paddb mm5, mm4
03463             movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
03464             movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
03465             paddb mm6, mm5
03466             movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
03467             movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
03468             add ebx, 64
03469             paddb mm7, mm6
03470             cmp ebx, ecx
03471             movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
03472             jb dsub8lp
03473             cmp ebx, MMXLength
03474             jnb dsub8lt8
03475 dsub8lpA:
03476             movq mm0, [edi+ebx]
03477             add ebx, 8
03478             paddb mm0, mm7
03479             cmp ebx, MMXLength
03480             movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
03481             movq mm7, mm0           // Move calculated Raw(x) data to mm7 to
03482                                     // be the new Raw(x-bpp) for the next loop
03483             jb dsub8lpA
03484 dsub8lt8:
03485          } // end _asm block
03486       }
03487       break;
03488
03489       default:                // bpp greater than 8 bytes
03490       {
03491          _asm {
03492             mov ebx, diff
03493             mov edi, row
03494             mov esi, edi           // lp = row
03495             add edi, bpp           // rp = row + bpp
03496 dsubAlp:
03497             movq mm0, [edi+ebx]
03498             movq mm1, [esi+ebx]
03499             add ebx, 8
03500             paddb mm0, mm1
03501             cmp ebx, MMXLength
03502             movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
03503                                    //  add ebx
03504             jb dsubAlp
03505          } // end _asm block
03506       }
03507       break;
03508
03509    } // end switch ( bpp )
03510
         // Scalar clean-up of the bytes left over after the last full
         // 8-byte group, i.e. offsets [MMXLength, FullLength).
03511    _asm {
03512         mov ebx, MMXLength
03513         mov edi, row
03514         cmp ebx, FullLength
03515         jnb dsubend
03516         mov esi, edi               // lp = row
03517         xor eax, eax
03518         add edi, bpp               // rp = row + bpp
03519 dsublp2:
03520         mov al, [esi+ebx]
03521         add [edi+ebx], al
03522         inc ebx
03523         cmp ebx, FullLength
03524         jb dsublp2
03525 dsubend:
03526         emms             // End MMX instructions; prep for possible FP instrs.
03527    } // end _asm block
03528 }
03529 
03530 // Optimized code for PNG Up filter decoder
      //
      // Reverses the PNG "Up" filter in place on one row:
      //    Raw(x) = Up(x) + Prior(x)   (mod 256)
      // for every byte x in [0, rowbytes).
      //
      //   row_info  supplies rowbytes, the number of bytes to process
      //   row       filtered row data; overwritten with the reconstructed bytes
      //   prev_row  the row above; only read
      //
      // Layout of the work:
      //   1. scalar loop until row+ebx is 8-byte aligned (0..7 bytes)
      //   2. unrolled MMX loop, 64 bytes per pass
      //   3. MMX loop, 8 bytes per pass, for what is left of the last 64
      //   4. scalar loop for the final 0..7 bytes
      //
      // Steps 1 and 2 execute their bodies without first comparing against
      // len, so the assembly path is only safe when the row is long enough.
      // Rows that are too short are handled by a plain C loop instead (see
      // the guard below).
03531 void /* PRIVATE */
03532 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
03533    png_bytep prev_row)
03534 {
03535    png_uint_32 len;
03536    len  = row_info->rowbytes;       // # of bytes to filter

         // Short-row guard.  The alignment loop below may consume up to 7
         // bytes and the unrolled loop always processes one full 64-byte
         // group before testing its bound.  With fewer than 7 + 64 bytes
         // that would run past the end of both rows, so use the scalar
         // algorithm instead.
         if (len < 7 + 64)
         {
            png_uint_32 i;

            for (i = 0; i < len; i++)
               row[i] = (png_byte)(((int)row[i] + (int)prev_row[i]) & 0xff);
            return;
         }

03537    _asm {
03538       mov edi, row
03539       // get # of bytes to alignment
03540       mov ecx, edi
03541       xor ebx, ebx
03542       add ecx, 0x7
03543       xor eax, eax
03544       and ecx, 0xfffffff8
03545       mov esi, prev_row
03546       sub ecx, edi
03547       jz dupgo
03548       // fix alignment
03549 duplp1:
03550       mov al, [edi+ebx]
03551       add al, [esi+ebx]
03552       inc ebx
03553       cmp ebx, ecx
03554       mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
03555       jb duplp1
03556 dupgo:
03557       mov ecx, len
03558       mov edx, ecx
03559       sub edx, ebx                  // subtract alignment fix
03560       and edx, 0x0000003f           // calc bytes over mult of 64
03561       sub ecx, edx                  // drop over bytes from length
03562       // Unrolled loop - use all MMX registers and interleave to reduce
03563       // number of branch instructions (loops) and reduce partial stalls
03564 duploop:
03565       movq mm1, [esi+ebx]
03566       movq mm0, [edi+ebx]
03567       movq mm3, [esi+ebx+8]
03568       paddb mm0, mm1
03569       movq mm2, [edi+ebx+8]
03570       movq [edi+ebx], mm0
03571       paddb mm2, mm3
03572       movq mm5, [esi+ebx+16]
03573       movq [edi+ebx+8], mm2
03574       movq mm4, [edi+ebx+16]
03575       movq mm7, [esi+ebx+24]
03576       paddb mm4, mm5
03577       movq mm6, [edi+ebx+24]
03578       movq [edi+ebx+16], mm4
03579       paddb mm6, mm7
03580       movq mm1, [esi+ebx+32]
03581       movq [edi+ebx+24], mm6
03582       movq mm0, [edi+ebx+32]
03583       movq mm3, [esi+ebx+40]
03584       paddb mm0, mm1
03585       movq mm2, [edi+ebx+40]
03586       movq [edi+ebx+32], mm0
03587       paddb mm2, mm3
03588       movq mm5, [esi+ebx+48]
03589       movq [edi+ebx+40], mm2
03590       movq mm4, [edi+ebx+48]
03591       movq mm7, [esi+ebx+56]
03592       paddb mm4, mm5
03593       movq mm6, [edi+ebx+56]
03594       movq [edi+ebx+48], mm4
03595       add ebx, 64
03596       paddb mm6, mm7
03597       cmp ebx, ecx
03598       movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
03599                                      // -8 to offset add ebx
03600       jb duploop
03601
03602       cmp edx, 0                     // Test for bytes over mult of 64
03603       jz dupend
03604
03605
03606       // 2 lines added by lcreeve@netins.net
03607       // (mail 11 Jul 98 in png-implement list)
03608       cmp edx, 8 //test for less than 8 bytes
03609       jb duplt8
03610
03611
03612       add ecx, edx
03613       and edx, 0x00000007           // calc bytes over mult of 8
03614       sub ecx, edx                  // drop over bytes from length
03615       jz duplt8
03616       // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
03617 duplpA:
03618       movq mm1, [esi+ebx]
03619       movq mm0, [edi+ebx]
03620       add ebx, 8
03621       paddb mm0, mm1
03622       cmp ebx, ecx
03623       movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
03624       jb duplpA
03625       cmp edx, 0            // Test for bytes over mult of 8
03626       jz dupend
03627 duplt8:
03628       xor eax, eax
03629       add ecx, edx          // move over byte count into counter
03630       // Loop using x86 registers to update remaining bytes
03631 duplp2:
03632       mov al, [edi + ebx]
03633       add al, [esi + ebx]
03634       inc ebx
03635       cmp ebx, ecx
03636       mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
03637       jb duplp2
03638 dupend:
03639       // Conversion of filtered row completed
03640       emms          // End MMX instructions; prep for possible FP instrs.
03641    } // end _asm block
03642 }
03643 
03644 
03645 // Optimized png_read_filter_row routines
03646 void /* PRIVATE */
03647 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
03648    row, png_bytep prev_row, int filter)
03649 {
03650 #ifdef PNG_DEBUG
03651    char filnm[10];
03652 #endif
03653 
03654    if (mmx_supported == 2) {
03655        /* this should have happened in png_init_mmx_flags() already */
03656        png_warning(png_ptr, "asm_flags may not have been initialized");
03657        png_mmx_support();
03658    }
03659 
03660 #ifdef PNG_DEBUG
03661    png_debug(1, "in png_read_filter_row\n");
03662    switch (filter)
03663    {
03664       case 0: sprintf(filnm, "none");
03665          break;
03666       case 1: sprintf(filnm, "sub-%s",
03667         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
03668          break;
03669       case 2: sprintf(filnm, "up-%s",
03670         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
03671          break;
03672       case 3: sprintf(filnm, "avg-%s",
03673         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
03674          break;
03675       case 4: sprintf(filnm, "Paeth-%s",
03676         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
03677          break;
03678       default: sprintf(filnm, "unknw");
03679          break;
03680    }
03681    png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
03682    png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
03683       (int)((row_info->pixel_depth + 7) >> 3));
03684    png_debug1(0,"len=%8d, ", row_info->rowbytes);
03685 #endif /* PNG_DEBUG */
03686 
03687    switch (filter)
03688    {
03689       case PNG_FILTER_VALUE_NONE:
03690          break;
03691 
03692       case PNG_FILTER_VALUE_SUB:
03693       {
03694          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
03695              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03696              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03697          {
03698             png_read_filter_row_mmx_sub(row_info, row);
03699          }
03700          else
03701          {
03702             png_uint_32 i;
03703             png_uint_32 istop = row_info->rowbytes;
03704             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03705             png_bytep rp = row + bpp;
03706             png_bytep lp = row;
03707 
03708             for (i = bpp; i < istop; i++)
03709             {
03710                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
03711                rp++;
03712             }
03713          }
03714          break;
03715       }
03716 
03717       case PNG_FILTER_VALUE_UP:
03718       {
03719          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
03720              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03721              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03722          {
03723             png_read_filter_row_mmx_up(row_info, row, prev_row);
03724          }
03725          else
03726          {
03727             png_uint_32 i;
03728             png_uint_32 istop = row_info->rowbytes;
03729             png_bytep rp = row;
03730             png_bytep pp = prev_row;
03731 
03732             for (i = 0; i < istop; ++i)
03733             {
03734                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
03735                rp++;
03736             }
03737          }
03738          break;
03739       }
03740 
03741       case PNG_FILTER_VALUE_AVG:
03742       {
03743          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
03744              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03745              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03746          {
03747             png_read_filter_row_mmx_avg(row_info, row, prev_row);
03748          }
03749          else
03750          {
03751             png_uint_32 i;
03752             png_bytep rp = row;
03753             png_bytep pp = prev_row;
03754             png_bytep lp = row;
03755             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03756             png_uint_32 istop = row_info->rowbytes - bpp;
03757 
03758             for (i = 0; i < bpp; i++)
03759             {
03760                *rp = (png_byte)(((int)(*rp) +
03761                   ((int)(*pp++) >> 1)) & 0xff);
03762                rp++;
03763             }
03764 
03765             for (i = 0; i < istop; i++)
03766             {
03767                *rp = (png_byte)(((int)(*rp) +
03768                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
03769                rp++;
03770             }
03771          }
03772          break;
03773       }
03774 
03775       case PNG_FILTER_VALUE_PAETH:
03776       {
03777          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
03778              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
03779              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
03780          {
03781             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
03782          }
03783          else
03784          {
03785             png_uint_32 i;
03786             png_bytep rp = row;
03787             png_bytep pp = prev_row;
03788             png_bytep lp = row;
03789             png_bytep cp = prev_row;
03790             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
03791             png_uint_32 istop=row_info->rowbytes - bpp;
03792 
03793             for (i = 0; i < bpp; i++)
03794             {
03795                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
03796                rp++;
03797             }
03798 
03799             for (i = 0; i < istop; i++)   // use leftover rp,pp
03800             {
03801                int a, b, c, pa, pb, pc, p;
03802 
03803                a = *lp++;
03804                b = *pp++;
03805                c = *cp++;
03806 
03807                p = b - c;
03808                pc = a - c;
03809 
03810 #ifdef PNG_USE_ABS
03811                pa = abs(p);
03812                pb = abs(pc);
03813                pc = abs(p + pc);
03814 #else
03815                pa = p < 0 ? -p : p;
03816                pb = pc < 0 ? -pc : pc;
03817                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
03818 #endif
03819 
03820                /*
03821                   if (pa <= pb && pa <= pc)
03822                      p = a;
03823                   else if (pb <= pc)
03824                      p = b;
03825                   else
03826                      p = c;
03827                 */
03828 
03829                p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
03830 
03831                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
03832                rp++;
03833             }
03834          }
03835          break;
03836       }
03837 
03838       default:
03839          png_warning(png_ptr, "Ignoring bad row filter type");
03840          *row=0;
03841          break;
03842    }
03843 }
03844 
03845 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */