Back to index

lightning-sunbird  0.9+nobinonly
fbmmx.c
Go to the documentation of this file.
00001 /*
00002  * Copyright © 2004 Red Hat, Inc.
00003  * Copyright © 2004 Nicholas Miell
00004  * Copyright © 2005 Trolltech AS
00005  *
00006  * Permission to use, copy, modify, distribute, and sell this software and its
00007  * documentation for any purpose is hereby granted without fee, provided that
00008  * the above copyright notice appear in all copies and that both that
00009  * copyright notice and this permission notice appear in supporting
00010  * documentation, and that the name of Red Hat not be used in advertising or
00011  * publicity pertaining to distribution of the software without specific,
00012  * written prior permission.  Red Hat makes no representations about the
00013  * suitability of this software for any purpose.  It is provided "as is"
00014  * without express or implied warranty.
00015  *
00016  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
00017  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
00018  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
00019  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
00020  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
00021  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
00022  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
00023  * SOFTWARE.
00024  *
00025  * Author:  Søren Sandmann (sandmann@redhat.com)
00026  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
00027  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) 
00028  *
00029  * Based on work by Owen Taylor
00030  */
00031 
00032 
00033 #ifdef HAVE_CONFIG_H
00034 #include <config.h>
00035 #endif
00036 
00037 #include <assert.h>
00038 #include "fbpict.h"
00039 #include "pixman-xserver-compat.h"
00040 #include "fbmmx.h"
00041 
00042 #if defined(__amd64__) || defined(__x86_64__)
00043 #define USE_SSE
00044 #endif
00045 
00046 #include <mmintrin.h>
00047 #ifdef USE_SSE
00048 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
00049 #endif
00050 
00051 #ifdef RENDER
00052 
00053 #include "fbpict.h"
00054 
00055 #define noVERBOSE
00056 
00057 #ifdef VERBOSE
00058 #define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
00059 #else
00060 #define CHECKPOINT()
00061 #endif
00062 
00063 /* Notes about writing mmx code
00064  *
00065  * give memory operands as the second operand. If you give it as the
00066  * first, gcc will first load it into a register, then use that
00067  * register
00068  *
00069  *   ie. use
00070  *
00071  *         _mm_mullo_pi16 (x, mmx_constant);
00072  *
00073  *   not
00074  *
00075  *         _mm_mullo_pi16 (mmx_constant, x);
00076  *
00077  * Also try to minimize dependencies. i.e. when you need a value, try
00078  * to calculate it from a value that was calculated as early as
00079  * possible.
00080  */
00081 
00082 /* --------------- MMX primitives ------------------------------------- */
00083 
/* 64-bit integer type used to spell out MMX register constants. */
typedef unsigned long long ullong;

/*
 * Bundle of 64-bit constants consumed by the MMX helpers in this file.
 * Each member is read through the MC() macro and reinterpreted as an
 * __m64 value.
 */
typedef struct
{
    /* per-16-bit-lane arithmetic constants */
    ullong mmx_4x00ff;
    ullong mmx_4x0080;

    /* r5g6b5 unpacking: field mask and per-lane multiplier */
    ullong mmx_565_rgb;
    ullong mmx_565_unpack_multiplier;

    /* r5g6b5 packing: per-channel masks applied to an unpacked pixel */
    ullong mmx_565_r;
    ullong mmx_565_g;
    ullong mmx_565_b;

    /* masks used by pack565 to clear the destination 16-bit slot 0..3 */
    ullong mmx_mask_0;
    ullong mmx_mask_1;
    ullong mmx_mask_2;
    ullong mmx_mask_3;

    /* value OR-ed into an expanded alpha by over_rev_non_pre */
    ullong mmx_full_alpha;

    /* lane-selection masks used by the non-SSE invert_colors */
    ullong mmx_ffff0000ffff0000;
    ullong mmx_0000ffff00000000;
    ullong mmx_000000000000ffff;
} MMXData;
00104 
00105 static const MMXData c =
00106 {
00107     .mmx_4x00ff =                  0x00ff00ff00ff00ffULL,
00108     .mmx_4x0080 =                  0x0080008000800080ULL,
00109     .mmx_565_rgb =                 0x000001f0003f001fULL,
00110     .mmx_565_r =                   0x000000f800000000ULL,
00111     .mmx_565_g =                   0x0000000000fc0000ULL,
00112     .mmx_565_b =                   0x00000000000000f8ULL,
00113     .mmx_mask_0 =                  0xffffffffffff0000ULL,
00114     .mmx_mask_1 =                  0xffffffff0000ffffULL,
00115     .mmx_mask_2 =                  0xffff0000ffffffffULL,
00116     .mmx_mask_3 =                  0x0000ffffffffffffULL,
00117     .mmx_full_alpha =                     0x00ff000000000000ULL,
00118     .mmx_565_unpack_multiplier =   0x0000008404100840ULL,
00119     .mmx_ffff0000ffff0000 =        0xffff0000ffff0000ULL,
00120     .mmx_0000ffff00000000 =        0x0000ffff00000000ULL,
00121     .mmx_000000000000ffff =        0x000000000000ffffULL,
00122 };
00123 
/* MC(name): fetch constant table member mmx_<name> and cast it to __m64. */
#define MC(x) ((__m64) c.mmx_##x)
00125 
00126 static __inline__ __m64
00127 shift (__m64 v, int s)
00128 {
00129     if (s > 0)
00130        return _mm_slli_si64 (v, s);
00131     else if (s < 0)
00132        return _mm_srli_si64 (v, -s);
00133     else
00134        return v;
00135 }
00136 
00137 static __inline__ __m64
00138 negate (__m64 mask)
00139 {
00140     return _mm_xor_si64 (mask, MC(4x00ff));
00141 }
00142 
00143 static __inline__ __m64
00144 pix_multiply (__m64 a, __m64 b)
00145 {
00146     __m64 res;
00147     
00148     res = _mm_mullo_pi16 (a, b);
00149     res = _mm_adds_pu16 (res, MC(4x0080));
00150     res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
00151     res = _mm_srli_pi16 (res, 8);
00152     
00153     return res;
00154 }
00155 
00156 static __inline__ __m64
00157 pix_add (__m64 a, __m64 b)
00158 {
00159     return  _mm_adds_pu8 (a, b);
00160 }
00161 
00162 #ifdef USE_SSE
00163 
00164 static __inline__ __m64
00165 expand_alpha (__m64 pixel)
00166 {
00167     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
00168 }
00169 
00170 static __inline__ __m64
00171 expand_alpha_rev (__m64 pixel)
00172 {
00173     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
00174 }    
00175 
00176 static __inline__ __m64
00177 invert_colors (__m64 pixel)
00178 {
00179     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
00180 }
00181 
00182 #else
00183 
00184 static __inline__ __m64
00185 expand_alpha (__m64 pixel)
00186 {
00187     __m64 t1, t2;
00188     
00189     t1 = shift (pixel, -48);
00190     t2 = shift (t1, 16);
00191     t1 = _mm_or_si64 (t1, t2);
00192     t2 = shift (t1, 32);
00193     t1 = _mm_or_si64 (t1, t2);
00194 
00195     return t1;
00196 }
00197 
00198 static __inline__ __m64
00199 expand_alpha_rev (__m64 pixel)
00200 {
00201     __m64 t1, t2;
00202 
00203     /* move alpha to low 16 bits and zero the rest */
00204     t1 = shift (pixel,  48);
00205     t1 = shift (t1, -48);
00206 
00207     t2 = shift (t1, 16);
00208     t1 = _mm_or_si64 (t1, t2);
00209     t2 = shift (t1, 32);
00210     t1 = _mm_or_si64 (t1, t2);
00211 
00212     return t1;
00213 }
00214 
00215 static __inline__ __m64
00216 invert_colors (__m64 pixel)
00217 {
00218     __m64 x, y, z;
00219 
00220     x = y = z = pixel;
00221 
00222     x = _mm_and_si64 (x, MC(ffff0000ffff0000));
00223     y = _mm_and_si64 (y, MC(000000000000ffff));
00224     z = _mm_and_si64 (z, MC(0000ffff00000000));
00225 
00226     y = shift (y, 32);
00227     z = shift (z, -32);
00228 
00229     x = _mm_or_si64 (x, y);
00230     x = _mm_or_si64 (x, z);
00231 
00232     return x;
00233 }
00234 
00235 #endif
00236 
00237 static __inline__ __m64
00238 over (__m64 src, __m64 srca, __m64 dest)
00239 {
00240     return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
00241 }
00242 
00243 static __inline__ __m64
00244 over_rev_non_pre (__m64 src, __m64 dest)
00245 {
00246     __m64 srca = expand_alpha (src);
00247     __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
00248     
00249     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
00250 }
00251 
00252 static __inline__ __m64
00253 in (__m64 src,
00254     __m64 mask)
00255 {
00256     return pix_multiply (src, mask);
00257 }
00258 
00259 static __inline__ __m64
00260 in_over (__m64 src,
00261         __m64 srca,
00262         __m64 mask,
00263         __m64 dest)
00264 {
00265     return over(in(src, mask), pix_multiply(srca, mask), dest);
00266 }
00267 
00268 static __inline__ __m64
00269 load8888 (CARD32 v)
00270 {
00271     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
00272 }
00273 
00274 static __inline__ __m64
00275 pack8888 (__m64 lo, __m64 hi)
00276 {
00277     return _mm_packs_pu16 (lo, hi);
00278 }
00279 
00280 static __inline__ CARD32
00281 store8888 (__m64 v)
00282 {
00283     return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
00284 }
00285 
00286 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
00287  *
00288  *    00RR00GG00BB
00289  * 
00290  * --- Expanding 565 in the low word ---
00291  * 
00292  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
00293  * m = m & (01f0003f001f);
00294  * m = m * (008404100840);
00295  * m = m >> 8;
00296  * 
00297  * Note the trick here - the top word is shifted by another nibble to
00298  * avoid it bumping into the middle word
00299  */
00300 static __inline__ __m64
00301 expand565 (__m64 pixel, int pos)
00302 {
00303     __m64 p = pixel;
00304     __m64 t1, t2;
00305     
00306     /* move pixel to low 16 bit and zero the rest */
00307     p = shift (shift (p, (3 - pos) * 16), -48); 
00308     
00309     t1 = shift (p, 36 - 11);
00310     t2 = shift (p, 16 - 5);
00311     
00312     p = _mm_or_si64 (t1, p);
00313     p = _mm_or_si64 (t2, p);
00314     p = _mm_and_si64 (p, MC(565_rgb));
00315     
00316     pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
00317     return _mm_srli_pi16 (pixel, 8);
00318 }
00319 
00320 static __inline__ __m64
00321 expand8888 (__m64 in, int pos)
00322 {
00323     if (pos == 0)
00324        return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
00325     else
00326        return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
00327 }
00328 
00329 static __inline__ __m64
00330 pack565 (__m64 pixel, __m64 target, int pos)
00331 {
00332     __m64 p = pixel;
00333     __m64 t = target;
00334     __m64 r, g, b;
00335     
00336     r = _mm_and_si64 (p, MC(565_r));
00337     g = _mm_and_si64 (p, MC(565_g));
00338     b = _mm_and_si64 (p, MC(565_b));
00339     
00340     r = shift (r, - (32 - 8) + pos * 16);
00341     g = shift (g, - (16 - 3) + pos * 16);
00342     b = shift (b, - (0  + 3) + pos * 16);
00343     
00344     if (pos == 0)
00345        t = _mm_and_si64 (t, MC(mask_0));
00346     else if (pos == 1)
00347        t = _mm_and_si64 (t, MC(mask_1));
00348     else if (pos == 2)
00349        t = _mm_and_si64 (t, MC(mask_2));
00350     else if (pos == 3)
00351        t = _mm_and_si64 (t, MC(mask_3));
00352     
00353     p = _mm_or_si64 (r, t);
00354     p = _mm_or_si64 (g, p);
00355     
00356     return _mm_or_si64 (b, p);
00357 }
00358 
00359 static __inline__ __m64
00360 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
00361 {
00362     x = _mm_mullo_pi16 (x, a);
00363     y = _mm_mullo_pi16 (y, b);
00364     x = _mm_adds_pu16 (x, MC(4x0080));
00365     x = _mm_adds_pu16 (x, y);
00366     x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
00367     x = _mm_srli_pi16 (x, 8);
00368 
00369     return x;
00370 }
00371 
00372 /* --------------- MMX code paths for fbcompose.c --------------------- */
00373 
00374 static FASTCALL void
00375 mmxCombineMaskU (CARD32 *src, const CARD32 *mask, int width)
00376 {
00377     const CARD32 *end = mask + width;
00378     while (mask < end) {
00379        CARD32 mmask = *mask;
00380        CARD32 maska = mmask >> 24;
00381        if (maska == 0) {
00382            *src = 0;
00383        } else if (maska != 0xff) {
00384            __m64 a = load8888(mmask);
00385            __m64 s = load8888(*src);
00386            a = expand_alpha(a);
00387            s = pix_multiply(s, a);
00388            *src = store8888(s);
00389        }
00390         ++src;
00391         ++mask;
00392     }
00393     _mm_empty();
00394 }
00395 
00396 
00397 static FASTCALL void
00398 mmxCombineOverU (CARD32 *dest, const CARD32 *src, int width)
00399 {
00400     const CARD32 *end = dest + width;
00401 
00402     while (dest < end) {
00403        CARD32 ssrc = *src;
00404        CARD32 a = ssrc >> 24;
00405        if (a == 0xff) {
00406            *dest = ssrc;
00407        } else if (a) {
00408            __m64 s, sa;
00409            s = load8888(ssrc);
00410            sa = expand_alpha(s);
00411            *dest = store8888(over(s, sa, load8888(*dest)));
00412        }
00413         ++dest;
00414         ++src;
00415     }
00416     _mm_empty();
00417 }
00418 
00419 static FASTCALL void
00420 mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
00421 {
00422     const CARD32 *end = dest + width;
00423 
00424     while (dest < end) {
00425        __m64 d, da;
00426        d = load8888(*dest);
00427        da = expand_alpha(d);
00428        *dest = store8888(over (d, da, load8888(*src)));
00429         ++dest;
00430         ++src;
00431     }
00432     _mm_empty();
00433 }
00434 
00435 static FASTCALL void
00436 mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
00437 {
00438     const CARD32 *end = dest + width;
00439 
00440     while (dest < end) {
00441         __m64 x, a;
00442         x = load8888(*src);
00443         a = load8888(*dest);
00444         a = expand_alpha(a);
00445         x = pix_multiply(x, a);
00446         *dest = store8888(x);
00447         ++dest;
00448         ++src;
00449     }
00450     _mm_empty();
00451 }
00452 
00453 static FASTCALL void
00454 mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
00455 {
00456     const CARD32 *end = dest + width;
00457 
00458     while (dest < end) {
00459         __m64 x, a;
00460         x = load8888(*dest);
00461         a = load8888(*src);
00462         a = expand_alpha(a);
00463         x = pix_multiply(x, a);
00464         *dest = store8888(x);
00465         ++dest;
00466         ++src;
00467     }
00468     _mm_empty();
00469 }
00470 
00471 static FASTCALL void
00472 mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
00473 {
00474     const CARD32 *end = dest + width;
00475 
00476     while (dest < end) {
00477         __m64 x, a;
00478         x = load8888(*src);
00479         a = load8888(*dest);
00480         a = expand_alpha(a);
00481         a = negate(a);
00482         x = pix_multiply(x, a);
00483         *dest = store8888(x);
00484         ++dest;
00485         ++src;
00486     }
00487     _mm_empty();
00488 }
00489 
00490 static FASTCALL void
00491 mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
00492 {
00493     const CARD32 *end = dest + width;
00494 
00495     while (dest < end) {
00496         __m64 x, a;
00497         x = load8888(*dest);
00498         a = load8888(*src);
00499         a = expand_alpha(a);
00500         a = negate(a);
00501         x = pix_multiply(x, a);
00502         *dest = store8888(x);
00503         ++dest;
00504         ++src;
00505     }
00506     _mm_empty();
00507 }
00508 
00509 static FASTCALL void
00510 mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
00511 {
00512     const CARD32 *end = dest + width;
00513 
00514     while (dest < end) {
00515         __m64 s, da, d, sia;
00516         s = load8888(*src);
00517         d = load8888(*dest);
00518         sia = expand_alpha(s);
00519         sia = negate(sia);
00520         da = expand_alpha(d);
00521         s = pix_add_mul (s, da, d, sia);
00522         *dest = store8888(s);
00523         ++dest;
00524         ++src;
00525     }
00526     _mm_empty();
00527 }
00528 
00529 static FASTCALL void
00530 mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
00531 {
00532     const CARD32 *end;
00533 
00534     end = dest + width;
00535 
00536     while (dest < end) {
00537         __m64 s, dia, d, sa;
00538         s = load8888(*src);
00539         d = load8888(*dest);
00540         sa = expand_alpha(s);
00541         dia = expand_alpha(d);
00542         dia = negate(dia);
00543        s = pix_add_mul (s, dia, d, sa);
00544         *dest = store8888(s);
00545         ++dest;
00546         ++src;
00547     }
00548     _mm_empty();
00549 }
00550 
00551 static FASTCALL void
00552 mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
00553 {
00554     const CARD32 *end = dest + width;
00555 
00556     while (dest < end) {
00557         __m64 s, dia, d, sia;
00558         s = load8888(*src);
00559         d = load8888(*dest);
00560         sia = expand_alpha(s);
00561         dia = expand_alpha(d);
00562         sia = negate(sia);
00563         dia = negate(dia);
00564        s = pix_add_mul (s, dia, d, sia);
00565         *dest = store8888(s);
00566         ++dest;
00567         ++src;
00568     }
00569     _mm_empty();
00570 }
00571 
00572 static FASTCALL void
00573 mmxCombineAddU (CARD32 *dest, const CARD32 *src, int width)
00574 {
00575     const CARD32 *end = dest + width;
00576     while (dest < end) {
00577         __m64 s, d;
00578        s = load8888(*src);
00579        d = load8888(*dest);
00580        s = pix_add(s, d);
00581        *dest = store8888(s);
00582         ++dest;
00583         ++src;
00584     }
00585     _mm_empty();
00586 }
00587 
00588 static FASTCALL void
00589 mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
00590 {
00591     const CARD32 *end = dest + width;
00592     while (dest < end) {
00593         CARD32 s = *src;
00594         CARD32 d = *dest;
00595         __m64 ms = load8888(s);
00596         __m64 md = load8888(d);
00597         CARD32 sa = s >> 24;
00598         CARD32 da = ~d >> 24;
00599 
00600         if (sa > da) {
00601             __m64 msa = load8888(FbIntDiv(da, sa)<<24);
00602             msa = expand_alpha(msa);
00603             ms = pix_multiply(ms, msa);
00604         }
00605         md = pix_add(md, ms);
00606         *dest = store8888(md);
00607         ++src;
00608         ++dest;
00609     }
00610     _mm_empty();
00611 }
00612 
00613 
00614 static FASTCALL void
00615 mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00616 {
00617     const CARD32 *end = src + width;
00618     while (src < end) {
00619         __m64 a = load8888(*mask);
00620         __m64 s = load8888(*src);
00621         s = pix_multiply(s, a);
00622         *dest = store8888(s);
00623         ++src;
00624         ++mask;
00625         ++dest;
00626     }
00627     _mm_empty();
00628 }
00629 
00630 static FASTCALL void
00631 mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00632 {
00633     const CARD32 *end = src + width;
00634     while (src < end) {
00635         __m64 a = load8888(*mask);
00636         __m64 s = load8888(*src);
00637         __m64 d = load8888(*dest);
00638         __m64 sa = expand_alpha(s);
00639        
00640        *dest = store8888(in_over (s, sa, a, d));
00641        
00642         ++src;
00643         ++dest;
00644         ++mask;
00645     }
00646     _mm_empty();
00647 }
00648 
00649 static FASTCALL void
00650 mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00651 {
00652     const CARD32 *end = src + width;
00653     while (src < end) {
00654         __m64 a = load8888(*mask);
00655         __m64 s = load8888(*src);
00656         __m64 d = load8888(*dest);
00657         __m64 da = expand_alpha(d);
00658 
00659        *dest = store8888(over (d, da, in (s, a)));
00660        
00661         ++src;
00662         ++dest;
00663         ++mask;
00664     }
00665     _mm_empty();
00666 }
00667 
00668 
00669 static FASTCALL void
00670 mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00671 {
00672     const CARD32 *end = src + width;
00673     while (src < end) {
00674         __m64 a = load8888(*mask);
00675         __m64 s = load8888(*src);
00676         __m64 d = load8888(*dest);
00677         __m64 da = expand_alpha(d);
00678         s = pix_multiply(s, a);
00679         s = pix_multiply(s, da);
00680         *dest = store8888(s);
00681         ++src;
00682         ++dest;
00683         ++mask;
00684     }
00685     _mm_empty();
00686 }
00687 
00688 static FASTCALL void
00689 mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00690 {
00691     const CARD32 *end = src + width;
00692     while (src < end) {
00693         __m64 a = load8888(*mask);
00694         __m64 s = load8888(*src);
00695         __m64 d = load8888(*dest);
00696         __m64 sa = expand_alpha(s);
00697         a = pix_multiply(a, sa);
00698         d = pix_multiply(d, a);
00699         *dest = store8888(d);
00700         ++src;
00701         ++dest;
00702         ++mask;
00703     }
00704     _mm_empty();
00705 }
00706 
00707 static FASTCALL void
00708 mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00709 {
00710     const CARD32 *end = src + width;
00711     while (src < end) {
00712         __m64 a = load8888(*mask);
00713         __m64 s = load8888(*src);
00714         __m64 d = load8888(*dest);
00715         __m64 da = expand_alpha(d);
00716         da = negate(da);
00717         s = pix_multiply(s, a);
00718         s = pix_multiply(s, da);
00719         *dest = store8888(s);
00720         ++src;
00721         ++dest;
00722         ++mask;
00723     }
00724     _mm_empty();
00725 }
00726 
00727 static FASTCALL void
00728 mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00729 {
00730     const CARD32 *end = src + width;
00731     while (src < end) {
00732         __m64 a = load8888(*mask);
00733         __m64 s = load8888(*src);
00734         __m64 d = load8888(*dest);
00735         __m64 sa = expand_alpha(s);
00736         a = pix_multiply(a, sa);
00737         a = negate(a);
00738         d = pix_multiply(d, a);
00739         *dest = store8888(d);
00740         ++src;
00741         ++dest;
00742         ++mask;
00743     }
00744     _mm_empty();
00745 }
00746 
00747 static FASTCALL void
00748 mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00749 {
00750     const CARD32 *end = src + width;
00751     while (src < end) {
00752         __m64 a = load8888(*mask);
00753         __m64 s = load8888(*src);
00754         __m64 d = load8888(*dest);
00755         __m64 da = expand_alpha(d);
00756         __m64 sa = expand_alpha(s); 
00757         s = pix_multiply(s, a);
00758         a = pix_multiply(a, sa);
00759         a = negate(a);
00760        d = pix_add_mul (d, a, s, da);
00761         *dest = store8888(d);
00762         ++src;
00763         ++dest;
00764         ++mask;
00765     }
00766     _mm_empty();
00767 }
00768 
00769 static FASTCALL void
00770 mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00771 {
00772     const CARD32 *end = src + width;
00773     while (src < end) {
00774         __m64 a = load8888(*mask);
00775         __m64 s = load8888(*src);
00776         __m64 d = load8888(*dest);
00777         __m64 da = expand_alpha(d);
00778         __m64 sa = expand_alpha(s);
00779         s = pix_multiply(s, a);
00780         a = pix_multiply(a, sa);
00781         da = negate(da);
00782        d = pix_add_mul (d, a, s, da);
00783         *dest = store8888(d);
00784         ++src;
00785         ++dest;
00786         ++mask;
00787     }
00788     _mm_empty();
00789 }
00790 
00791 static FASTCALL void
00792 mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00793 {
00794     const CARD32 *end = src + width;
00795     while (src < end) {
00796         __m64 a = load8888(*mask);
00797         __m64 s = load8888(*src);
00798         __m64 d = load8888(*dest);
00799         __m64 da = expand_alpha(d);
00800         __m64 sa = expand_alpha(s);
00801         s = pix_multiply(s, a);
00802         a = pix_multiply(a, sa);
00803         da = negate(da);
00804         a = negate(a);
00805        d = pix_add_mul (d, a, s, da);
00806         *dest = store8888(d);
00807         ++src;
00808         ++dest;
00809         ++mask;
00810     }
00811     _mm_empty();
00812 }
00813 
00814 static FASTCALL void
00815 mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
00816 {
00817     const CARD32 *end = src + width;
00818     while (src < end) {
00819         __m64 a = load8888(*mask);
00820         __m64 s = load8888(*src);
00821         __m64 d = load8888(*dest);
00822         s = pix_multiply(s, a);
00823         d = pix_add(s, d);
00824         *dest = store8888(d);
00825         ++src;
00826         ++dest;
00827         ++mask;
00828     }
00829     _mm_empty();
00830 }
00831 
00832 extern FbComposeFunctions composeFunctions;
00833 
00834 void fbComposeSetupMMX(void)
00835 {
00836     /* check if we have MMX support and initialize accordingly */
00837     if (fbHaveMMX()) {
00838         composeFunctions.combineU[PIXMAN_OPERATOR_OVER] = mmxCombineOverU;
00839         composeFunctions.combineU[PIXMAN_OPERATOR_OVER_REVERSE] = mmxCombineOverReverseU;
00840         composeFunctions.combineU[PIXMAN_OPERATOR_IN] = mmxCombineInU;
00841         composeFunctions.combineU[PIXMAN_OPERATOR_IN_REVERSE] = mmxCombineInReverseU;
00842         composeFunctions.combineU[PIXMAN_OPERATOR_OUT] = mmxCombineOutU;
00843         composeFunctions.combineU[PIXMAN_OPERATOR_OUT_REVERSE] = mmxCombineOutReverseU;
00844         composeFunctions.combineU[PIXMAN_OPERATOR_ATOP] = mmxCombineAtopU;
00845         composeFunctions.combineU[PIXMAN_OPERATOR_ATOP_REVERSE] = mmxCombineAtopReverseU;
00846         composeFunctions.combineU[PIXMAN_OPERATOR_XOR] = mmxCombineXorU;
00847         composeFunctions.combineU[PIXMAN_OPERATOR_ADD] = mmxCombineAddU;
00848         composeFunctions.combineU[PIXMAN_OPERATOR_SATURATE] = mmxCombineSaturateU;
00849 
00850         composeFunctions.combineC[PIXMAN_OPERATOR_SRC] = mmxCombineSrcC;
00851         composeFunctions.combineC[PIXMAN_OPERATOR_OVER] = mmxCombineOverC;
00852         composeFunctions.combineC[PIXMAN_OPERATOR_OVER_REVERSE] = mmxCombineOverReverseC;
00853         composeFunctions.combineC[PIXMAN_OPERATOR_IN] = mmxCombineInC;
00854         composeFunctions.combineC[PIXMAN_OPERATOR_IN_REVERSE] = mmxCombineInReverseC;
00855         composeFunctions.combineC[PIXMAN_OPERATOR_OUT] = mmxCombineOutC;
00856         composeFunctions.combineC[PIXMAN_OPERATOR_OUT_REVERSE] = mmxCombineOutReverseC;
00857         composeFunctions.combineC[PIXMAN_OPERATOR_ATOP] = mmxCombineAtopC;
00858         composeFunctions.combineC[PIXMAN_OPERATOR_ATOP_REVERSE] = mmxCombineAtopReverseC;
00859         composeFunctions.combineC[PIXMAN_OPERATOR_XOR] = mmxCombineXorC;
00860         composeFunctions.combineC[PIXMAN_OPERATOR_ADD] = mmxCombineAddC;
00861 
00862         composeFunctions.combineMaskU = mmxCombineMaskU;
00863     } 
00864 }
00865 
00866 
00867 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
00868 
00869 void
00870 fbCompositeSolid_nx8888mmx (pixman_operator_t    op,
00871                          PicturePtr pSrc,
00872                          PicturePtr pMask,
00873                          PicturePtr pDst,
00874                          INT16     xSrc,
00875                          INT16     ySrc,
00876                          INT16     xMask,
00877                          INT16     yMask,
00878                          INT16     xDst,
00879                          INT16     yDst,
00880                          CARD16    width,
00881                          CARD16    height)
00882 {
00883     CARD32    src;
00884     CARD32    *dstLine, *dst;
00885     CARD16    w;
00886     FbStride  dstStride;
00887     __m64     vsrc, vsrca;
00888     
00889     CHECKPOINT();
00890     
00891     fbComposeGetSolid(pSrc, pDst, src);
00892     
00893     if (src >> 24 == 0)
00894        return;
00895     
00896     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
00897     
00898     vsrc = load8888 (src);
00899     vsrca = expand_alpha (vsrc);
00900     
00901     while (height--)
00902     {
00903        dst = dstLine;
00904        dstLine += dstStride;
00905        w = width;
00906        
00907        CHECKPOINT();
00908        
00909        while (w && (unsigned long)dst & 7)
00910        {
00911            *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
00912            
00913            w--;
00914            dst++;
00915        }
00916        
00917        while (w >= 2)
00918        {
00919            __m64 vdest;
00920            __m64 dest0, dest1;
00921            
00922            vdest = *(__m64 *)dst;
00923            
00924            dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
00925            dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
00926            
00927            *(__m64 *)dst = pack8888(dest0, dest1);
00928            
00929            dst += 2;
00930            w -= 2;
00931        }
00932        
00933        CHECKPOINT();
00934        
00935        while (w)
00936        {
00937            *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
00938            
00939            w--;
00940            dst++;
00941        }
00942     }
00943     
00944     _mm_empty();
00945 }
00946 
00947 void
00948 fbCompositeSolid_nx0565mmx (pixman_operator_t    op,
00949                          PicturePtr pSrc,
00950                          PicturePtr pMask,
00951                          PicturePtr pDst,
00952                          INT16     xSrc,
00953                          INT16     ySrc,
00954                          INT16     xMask,
00955                          INT16     yMask,
00956                          INT16     xDst,
00957                          INT16     yDst,
00958                          CARD16    width,
00959                          CARD16    height)
00960 {
00961     CARD32    src;
00962     CARD16    *dstLine, *dst;
00963     CARD16    w;
00964     FbStride  dstStride;
00965     __m64     vsrc, vsrca;
00966     
00967     CHECKPOINT();
00968     
00969     fbComposeGetSolid(pSrc, pDst, src);
00970     
00971     if (src >> 24 == 0)
00972        return;
00973     
00974     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
00975     
00976     vsrc = load8888 (src);
00977     vsrca = expand_alpha (vsrc);
00978     
00979     while (height--)
00980     {
00981        dst = dstLine;
00982        dstLine += dstStride;
00983        w = width;
00984        
00985        CHECKPOINT();
00986        
00987        while (w && (unsigned long)dst & 7)
00988        {
00989            ullong d = *dst;
00990            __m64 vdest = expand565 ((__m64)d, 0);
00991            vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
00992            *dst = (ullong)vdest;
00993            
00994            w--;
00995            dst++;
00996        }
00997        
00998        while (w >= 4)
00999        {
01000            __m64 vdest;
01001            
01002            vdest = *(__m64 *)dst;
01003            
01004            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
01005            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
01006            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
01007            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
01008            
01009            *(__m64 *)dst = vdest;
01010            
01011            dst += 4;
01012            w -= 4;
01013        }
01014        
01015        CHECKPOINT();
01016        
01017        while (w)
01018        {
01019            ullong d = *dst;
01020            __m64 vdest = expand565 ((__m64)d, 0);
01021            vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
01022            *dst = (ullong)vdest;
01023            
01024            w--;
01025            dst++;
01026        }
01027     }
01028     
01029     _mm_empty();
01030 }
01031 
01032 void
01033 fbCompositeSolidMask_nx8888x8888Cmmx (pixman_operator_t op,
01034                                   PicturePtr pSrc,
01035                                   PicturePtr pMask,
01036                                   PicturePtr pDst,
01037                                   INT16   xSrc,
01038                                   INT16   ySrc,
01039                                   INT16   xMask,
01040                                   INT16   yMask,
01041                                   INT16   xDst,
01042                                   INT16   yDst,
01043                                   CARD16  width,
01044                                   CARD16  height)
01045 {
01046     CARD32    src, srca;
01047     CARD32    *dstLine;
01048     CARD32    *maskLine;
01049     FbStride  dstStride, maskStride;
01050     __m64     vsrc, vsrca;
01051     
01052     CHECKPOINT();
01053     
01054     fbComposeGetSolid(pSrc, pDst, src);
01055     
01056     srca = src >> 24;
01057     if (srca == 0)
01058        return;
01059     
01060     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01061     fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
01062     
01063     vsrc = load8888(src);
01064     vsrca = expand_alpha(vsrc);
01065     
01066     while (height--)
01067     {
01068        int twidth = width;
01069        CARD32 *p = (CARD32 *)maskLine;
01070        CARD32 *q = (CARD32 *)dstLine;
01071        
01072        while (twidth && (unsigned long)q & 7)
01073        {
01074            CARD32 m = *(CARD32 *)p;
01075            
01076            if (m)
01077            {
01078               __m64 vdest = load8888(*q);
01079               vdest = in_over(vsrc, vsrca, load8888(m), vdest);
01080               *q = store8888(vdest);
01081            }
01082            
01083            twidth--;
01084            p++;
01085            q++;
01086        }
01087        
01088        while (twidth >= 2)
01089        {
01090            CARD32 m0, m1;
01091            m0 = *p;
01092            m1 = *(p + 1);
01093            
01094            if (m0 | m1)
01095            {
01096               __m64 dest0, dest1;
01097               __m64 vdest = *(__m64 *)q;
01098               
01099               dest0 = in_over(vsrc, vsrca, load8888(m0),
01100                             expand8888 (vdest, 0));
01101               dest1 = in_over(vsrc, vsrca, load8888(m1),
01102                             expand8888 (vdest, 1));
01103               
01104               *(__m64 *)q = pack8888(dest0, dest1);
01105            }
01106            
01107            p += 2;
01108            q += 2;
01109            twidth -= 2;
01110        }
01111        
01112        while (twidth)
01113        {
01114            CARD32 m = *(CARD32 *)p;
01115            
01116            if (m)
01117            {
01118               __m64 vdest = load8888(*q);
01119               vdest = in_over(vsrc, vsrca, load8888(m), vdest);
01120               *q = store8888(vdest);
01121            }
01122            
01123            twidth--;
01124            p++;
01125            q++;
01126        }
01127        
01128        dstLine += dstStride;
01129        maskLine += maskStride;
01130     }
01131     
01132     _mm_empty();
01133 }
01134 
01135 void
01136 fbCompositeSrc_8888x8x8888mmx (pixman_operator_t op,
01137                             PicturePtr pSrc,
01138                             PicturePtr pMask,
01139                             PicturePtr pDst,
01140                             INT16  xSrc,
01141                             INT16  ySrc,
01142                             INT16      xMask,
01143                             INT16      yMask,
01144                             INT16      xDst,
01145                             INT16      yDst,
01146                             CARD16     width,
01147                             CARD16     height)
01148 {
01149     CARD32    *dstLine, *dst;
01150     CARD32    *srcLine, *src;
01151     CARD8     *maskLine;
01152     CARD32    mask;
01153     __m64     vmask;
01154     FbStride  dstStride, srcStride, maskStride;
01155     CARD16    w;
01156     __m64  srca;
01157     
01158     CHECKPOINT();
01159     
01160     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01161     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
01162     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
01163 
01164     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
01165     vmask = load8888 (mask);
01166     srca = MC(4x00ff);
01167     
01168     while (height--)
01169     {
01170        dst = dstLine;
01171        dstLine += dstStride;
01172        src = srcLine;
01173        srcLine += srcStride;
01174        w = width;
01175 
01176        while (w && (unsigned long)dst & 7)
01177        {
01178            __m64 s = load8888 (*src);
01179            __m64 d = load8888 (*dst);
01180            
01181            *dst = store8888 (over (s, expand_alpha (s), d));
01182            
01183            w--;
01184            dst++;
01185            src++;
01186        }
01187 
01188        while (w >= 2)
01189        {
01190            __m64 vs = *(__m64 *)dst;
01191            __m64 vd = *(__m64 *)src;
01192            __m64 vsrc0 = expand8888 (vs, 0);
01193            __m64 vsrc1 = expand8888 (vs, 1);
01194 
01195            *(__m64 *)dst = (__m64)pack8888 (
01196               in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
01197               in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
01198        
01199            w -= 2;
01200            dst += 2;
01201            src += 2;
01202        }
01203        
01204        while (w)
01205        {
01206            __m64 s = load8888 (*src);
01207            __m64 d = load8888 (*dst);
01208            
01209            *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
01210            
01211            w--;
01212            dst++;
01213            src++;
01214        }
01215     }
01216 
01217     _mm_empty(); 
01218 }
01219 
01220 void
01221 fbCompositeSrc_x888x8x8888mmx (pixman_operator_t op,
01222                             PicturePtr pSrc,
01223                             PicturePtr pMask,
01224                             PicturePtr pDst,
01225                             INT16  xSrc,
01226                             INT16  ySrc,
01227                             INT16      xMask,
01228                             INT16      yMask,
01229                             INT16      xDst,
01230                             INT16      yDst,
01231                             CARD16     width,
01232                             CARD16     height)
01233 {
01234     CARD32    *dstLine, *dst;
01235     CARD32    *srcLine, *src;
01236     CARD8     *maskLine;
01237     CARD32    mask;
01238     __m64     vmask;
01239     FbStride  dstStride, srcStride, maskStride;
01240     CARD16    w;
01241     __m64  srca;
01242     
01243     CHECKPOINT();
01244     
01245     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01246     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
01247     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
01248 
01249     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
01250     vmask = load8888 (mask);
01251     srca = MC(4x00ff);
01252     
01253     while (height--)
01254     {
01255        dst = dstLine;
01256        dstLine += dstStride;
01257        src = srcLine;
01258        srcLine += srcStride;
01259        w = width;
01260 
01261        while (w && (unsigned long)dst & 7)
01262        {
01263            __m64 s = load8888 (*src);
01264            __m64 d = load8888 (*dst);
01265            
01266            *dst = store8888 (in_over (s, srca, vmask, d));
01267            
01268            w--;
01269            dst++;
01270            src++;
01271        }
01272 
01273        while (w >= 16)
01274        {
01275            __m64 vd0 = *(__m64 *)(dst + 0);
01276            __m64 vd1 = *(__m64 *)(dst + 2);
01277            __m64 vd2 = *(__m64 *)(dst + 4);
01278            __m64 vd3 = *(__m64 *)(dst + 6);
01279            __m64 vd4 = *(__m64 *)(dst + 8);
01280            __m64 vd5 = *(__m64 *)(dst + 10);
01281            __m64 vd6 = *(__m64 *)(dst + 12);
01282            __m64 vd7 = *(__m64 *)(dst + 14);
01283 
01284            __m64 vs0 = *(__m64 *)(src + 0);
01285            __m64 vs1 = *(__m64 *)(src + 2);
01286            __m64 vs2 = *(__m64 *)(src + 4);
01287            __m64 vs3 = *(__m64 *)(src + 6);
01288            __m64 vs4 = *(__m64 *)(src + 8);
01289            __m64 vs5 = *(__m64 *)(src + 10);
01290            __m64 vs6 = *(__m64 *)(src + 12);
01291            __m64 vs7 = *(__m64 *)(src + 14);
01292 
01293            vd0 = (__m64)pack8888 (
01294               in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
01295               in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
01296        
01297            vd1 = (__m64)pack8888 (
01298               in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
01299               in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
01300        
01301            vd2 = (__m64)pack8888 (
01302               in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
01303               in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
01304        
01305            vd3 = (__m64)pack8888 (
01306               in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
01307               in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
01308        
01309            vd4 = (__m64)pack8888 (
01310               in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
01311               in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
01312        
01313            vd5 = (__m64)pack8888 (
01314               in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
01315               in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
01316        
01317            vd6 = (__m64)pack8888 (
01318               in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
01319               in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
01320        
01321            vd7 = (__m64)pack8888 (
01322               in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
01323               in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
01324        
01325            *(__m64 *)(dst + 0) = vd0;
01326            *(__m64 *)(dst + 2) = vd1;
01327            *(__m64 *)(dst + 4) = vd2;
01328            *(__m64 *)(dst + 6) = vd3;
01329            *(__m64 *)(dst + 8) = vd4;
01330            *(__m64 *)(dst + 10) = vd5;
01331            *(__m64 *)(dst + 12) = vd6;
01332            *(__m64 *)(dst + 14) = vd7;
01333 
01334            w -= 16;
01335            dst += 16;
01336            src += 16;
01337        }
01338        
01339        while (w)
01340        {
01341            __m64 s = load8888 (*src);
01342            __m64 d = load8888 (*dst);
01343            
01344            *dst = store8888 (in_over (s, srca, vmask, d));
01345            
01346            w--;
01347            dst++;
01348            src++;
01349        }
01350     }
01351 
01352     _mm_empty(); 
01353 }
01354 
01355 void
01356 fbCompositeSrc_8888x8888mmx (pixman_operator_t      op,
01357                           PicturePtr pSrc,
01358                           PicturePtr pMask,
01359                           PicturePtr pDst,
01360                           INT16      xSrc,
01361                           INT16      ySrc,
01362                           INT16      xMask,
01363                           INT16      yMask,
01364                           INT16      xDst,
01365                           INT16      yDst,
01366                           CARD16     width,
01367                           CARD16     height)
01368 {
01369     CARD32    *dstLine, *dst;
01370     CARD32    *srcLine, *src, s;
01371     FbStride  dstStride, srcStride;
01372     CARD8     a;
01373     CARD16    w;
01374 
01375     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01376     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
01377 
01378     while (height--)
01379     {
01380        dst = dstLine;
01381        dstLine += dstStride;
01382        src = srcLine;
01383        srcLine += srcStride;
01384        w = width;
01385 
01386        while (w--)
01387        {
01388            s = *src++;
01389            a = s >> 24;
01390            if (a == 0xff)
01391               *dst = s;
01392            else if (a) {
01393               __m64 ms, sa;
01394               ms = load8888(s);
01395               sa = expand_alpha(ms);
01396               *dst = store8888(over(ms, sa, load8888(*dst)));
01397            }
01398            dst++;
01399        }
01400     }
01401     _mm_empty();
01402 }
01403 
01404 
01405 void
01406 fbCompositeSolidMask_nx8x8888mmx (pixman_operator_t      op,
01407                               PicturePtr pSrc,
01408                               PicturePtr pMask,
01409                               PicturePtr pDst,
01410                               INT16      xSrc,
01411                               INT16      ySrc,
01412                               INT16      xMask,
01413                               INT16      yMask,
01414                               INT16      xDst,
01415                               INT16      yDst,
01416                               CARD16     width,
01417                               CARD16     height)
01418 {
01419     CARD32    src, srca;
01420     CARD32    *dstLine, *dst;
01421     CARD8     *maskLine, *mask;
01422     FbStride  dstStride, maskStride;
01423     CARD16    w;
01424     __m64     vsrc, vsrca;
01425     ullong    srcsrc;
01426     
01427     CHECKPOINT();
01428     
01429     fbComposeGetSolid(pSrc, pDst, src);
01430     
01431     srca = src >> 24;
01432     if (srca == 0)
01433        return;
01434     
01435     srcsrc = (unsigned long long)src << 32 | src;
01436     
01437     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01438     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
01439     
01440     vsrc = load8888 (src);
01441     vsrca = expand_alpha (vsrc);
01442     
01443     while (height--)
01444     {
01445        dst = dstLine;
01446        dstLine += dstStride;
01447        mask = maskLine;
01448        maskLine += maskStride;
01449        w = width;
01450        
01451        CHECKPOINT();
01452        
01453        while (w && (unsigned long)dst & 7)
01454        {
01455            ullong m = *mask;
01456            
01457            if (m)
01458            {
01459               __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
01460               *dst = store8888(vdest);
01461            }
01462            
01463            w--;
01464            mask++;
01465            dst++;
01466        }
01467        
01468        CHECKPOINT();
01469        
01470        while (w >= 2)
01471        {
01472            ullong m0, m1;
01473            m0 = *mask;
01474            m1 = *(mask + 1);
01475            
01476            if (srca == 0xff && (m0 & m1) == 0xff)
01477            {
01478               *(unsigned long long *)dst = srcsrc;
01479            }
01480            else if (m0 | m1)
01481            {
01482               __m64 vdest;
01483               __m64 dest0, dest1;
01484               
01485               vdest = *(__m64 *)dst;
01486               
01487               dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
01488               dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
01489               
01490               *(__m64 *)dst = pack8888(dest0, dest1);
01491            }
01492            
01493            mask += 2;
01494            dst += 2;
01495            w -= 2;
01496        }
01497        
01498        CHECKPOINT();
01499        
01500        while (w)
01501        {
01502            ullong m = *mask;
01503            
01504            if (m)
01505            {
01506               __m64 vdest = load8888(*dst);
01507               vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
01508               *dst = store8888(vdest);
01509            }
01510            
01511            w--;
01512            mask++;
01513            dst++;
01514        }
01515     }
01516     
01517     _mm_empty();
01518 }
01519 
01520 void
01521 fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_operator_t      op,
01522                                  PicturePtr pSrc,
01523                                  PicturePtr pMask,
01524                                  PicturePtr pDst,
01525                                  INT16      xSrc,
01526                                  INT16      ySrc,
01527                                  INT16      xMask,
01528                                  INT16      yMask,
01529                                  INT16      xDst,
01530                                  INT16      yDst,
01531                                  CARD16     width,
01532                                  CARD16     height)
01533 {
01534     CARD32    src, srca;
01535     CARD32    *dstLine, *dst;
01536     CARD8     *maskLine, *mask;
01537     FbStride  dstStride, maskStride;
01538     CARD16    w;
01539     __m64     vsrc, vsrca;
01540     ullong    srcsrc;
01541     
01542     CHECKPOINT();
01543     
01544     fbComposeGetSolid(pSrc, pDst, src);
01545     
01546     srca = src >> 24;
01547     if (srca == 0)
01548     {
01549        if (fbSolidFillmmx (pDst->pDrawable, xDst, yDst, width, height, 0))
01550            return;
01551     }
01552     
01553     srcsrc = (unsigned long long)src << 32 | src;
01554     
01555     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01556     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
01557     
01558     vsrc = load8888 (src);
01559     vsrca = expand_alpha (vsrc);
01560     
01561     while (height--)
01562     {
01563        dst = dstLine;
01564        dstLine += dstStride;
01565        mask = maskLine;
01566        maskLine += maskStride;
01567        w = width;
01568        
01569        CHECKPOINT();
01570        
01571        while (w && (unsigned long)dst & 7)
01572        {
01573            ullong m = *mask;
01574            
01575            if (m)
01576            {
01577                 __m64 vdest = in(vsrc, expand_alpha_rev ((__m64)m));
01578                 *dst = store8888(vdest);
01579            }
01580            else
01581            {
01582                 *dst = 0;
01583            }
01584            
01585            w--;
01586            mask++;
01587            dst++;
01588        }
01589        
01590        CHECKPOINT();
01591        
01592        while (w >= 2)
01593        {
01594            ullong m0, m1;
01595            m0 = *mask;
01596            m1 = *(mask + 1);
01597            
01598            if (srca == 0xff && (m0 & m1) == 0xff)
01599            {
01600               *(unsigned long long *)dst = srcsrc;
01601            }
01602            else if (m0 | m1)
01603            {
01604               __m64 vdest;
01605               __m64 dest0, dest1;
01606               
01607               vdest = *(__m64 *)dst;
01608               
01609               dest0 = in(vsrc, expand_alpha_rev ((__m64)m0));
01610               dest1 = in(vsrc, expand_alpha_rev ((__m64)m1));
01611               
01612               *(__m64 *)dst = pack8888(dest0, dest1);
01613            }
01614            else
01615            {
01616               *dst = 0;
01617            }
01618            
01619            mask += 2;
01620            dst += 2;
01621            w -= 2;
01622        }
01623        
01624        CHECKPOINT();
01625        
01626        while (w)
01627        {
01628            ullong m = *mask;
01629            
01630            if (m)
01631            {
01632               __m64 vdest = load8888(*dst);
01633               vdest = in(vsrc, expand_alpha_rev ((__m64)m));
01634               *dst = store8888(vdest);
01635            }
01636            else
01637            {
01638               *dst = 0;
01639            }
01640            
01641            w--;
01642            mask++;
01643            dst++;
01644        }
01645     }
01646     
01647     _mm_empty();
01648 }
01649 
01650 
01651 void
01652 fbCompositeSolidMask_nx8x0565mmx (pixman_operator_t      op,
01653                               PicturePtr pSrc,
01654                               PicturePtr pMask,
01655                               PicturePtr pDst,
01656                               INT16      xSrc,
01657                               INT16      ySrc,
01658                               INT16      xMask,
01659                               INT16      yMask,
01660                               INT16      xDst,
01661                               INT16      yDst,
01662                               CARD16     width,
01663                               CARD16     height)
01664 {
01665     CARD32    src, srca;
01666     CARD16    *dstLine, *dst;
01667     CARD8     *maskLine, *mask;
01668     FbStride  dstStride, maskStride;
01669     CARD16    w;
01670     __m64     vsrc, vsrca;
01671     unsigned long long srcsrcsrcsrc, src16;
01672     
01673     CHECKPOINT();
01674     
01675     fbComposeGetSolid(pSrc, pDst, src);
01676     
01677     srca = src >> 24;
01678     if (srca == 0)
01679        return;
01680     
01681     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
01682     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
01683     
01684     vsrc = load8888 (src);
01685     vsrca = expand_alpha (vsrc);
01686     
01687     src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
01688     
01689     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
01690        (ullong)src16 << 16 | (ullong)src16;
01691     
01692     while (height--)
01693     {
01694        dst = dstLine;
01695        dstLine += dstStride;
01696        mask = maskLine;
01697        maskLine += maskStride;
01698        w = width;
01699        
01700        CHECKPOINT();
01701        
01702        while (w && (unsigned long)dst & 7)
01703        {
01704            ullong m = *mask;
01705            
01706            if (m)
01707            {
01708               ullong d = *dst;
01709               __m64 vd = (__m64)d;
01710               __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
01711               *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
01712            }
01713            
01714            w--;
01715            mask++;
01716            dst++;
01717        }
01718        
01719        CHECKPOINT();
01720        
01721        while (w >= 4)
01722        {
01723            ullong m0, m1, m2, m3;
01724            m0 = *mask;
01725            m1 = *(mask + 1);
01726            m2 = *(mask + 2);
01727            m3 = *(mask + 3);
01728            
01729            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
01730            {
01731               *(unsigned long long *)dst = srcsrcsrcsrc;
01732            }
01733            else if (m0 | m1 | m2 | m3)
01734            {
01735               __m64 vdest;
01736               __m64 vm0, vm1, vm2, vm3;
01737               
01738               vdest = *(__m64 *)dst;
01739               
01740               vm0 = (__m64)m0;
01741               vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
01742               vm1 = (__m64)m1;
01743               vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
01744               vm2 = (__m64)m2;
01745               vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
01746               vm3 = (__m64)m3;
01747               vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
01748               
01749               *(__m64 *)dst = vdest;
01750            }
01751            
01752            w -= 4;
01753            mask += 4;
01754            dst += 4;
01755        }
01756        
01757        CHECKPOINT();
01758        
01759        while (w)
01760        {
01761            ullong m = *mask;
01762            
01763            if (m)
01764            {
01765               ullong d = *dst;
01766               __m64 vd = (__m64)d;
01767               __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
01768               *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
01769            }
01770            
01771            w--;
01772            mask++;
01773            dst++;
01774        }
01775     }
01776     
01777     _mm_empty();
01778 }
01779 
01780 void
01781 fbCompositeSrc_8888RevNPx0565mmx (pixman_operator_t      op,
01782                               PicturePtr pSrc,
01783                               PicturePtr pMask,
01784                               PicturePtr pDst,
01785                               INT16      xSrc,
01786                               INT16      ySrc,
01787                               INT16      xMask,
01788                               INT16      yMask,
01789                               INT16      xDst,
01790                               INT16      yDst,
01791                               CARD16     width,
01792                               CARD16     height)
01793 {
01794     CARD16    *dstLine, *dst;
01795     CARD32    *srcLine, *src;
01796     FbStride  dstStride, srcStride;
01797     CARD16    w;
01798     
01799     CHECKPOINT();
01800     
01801     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
01802     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
01803     
01804     assert (pSrc->pDrawable == pMask->pDrawable);
01805     
01806     while (height--)
01807     {
01808        dst = dstLine;
01809        dstLine += dstStride;
01810        src = srcLine;
01811        srcLine += srcStride;
01812        w = width;
01813        
01814        CHECKPOINT();
01815        
01816        while (w && (unsigned long)dst & 7)
01817        {
01818            __m64 vsrc = load8888 (*src);
01819            ullong d = *dst;
01820            __m64 vdest = expand565 ((__m64)d, 0);
01821            
01822            vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
01823            
01824            *dst = (ullong)vdest;
01825            
01826            w--;
01827            dst++;
01828            src++;
01829        }
01830        
01831        CHECKPOINT();
01832        
01833        while (w >= 4)
01834        {
01835            CARD32 s0, s1, s2, s3;
01836            unsigned char a0, a1, a2, a3;
01837            
01838            s0 = *src;
01839            s1 = *(src + 1);
01840            s2 = *(src + 2);
01841            s3 = *(src + 3);
01842            
01843            a0 = (s0 >> 24);
01844            a1 = (s1 >> 24);
01845            a2 = (s2 >> 24);
01846            a3 = (s3 >> 24);
01847            
01848            if ((a0 & a1 & a2 & a3) == 0xFF)
01849            {
01850               __m64 vdest;
01851               vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
01852               vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
01853               vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
01854               vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
01855               
01856               *(__m64 *)dst = vdest;
01857            }
01858            else if (a0 | a1 | a2 | a3)
01859            {
01860               __m64 vdest = *(__m64 *)dst;
01861               
01862               vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
01863                vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
01864               vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
01865               vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
01866               
01867               *(__m64 *)dst = vdest;
01868            }
01869            
01870            w -= 4;
01871            dst += 4;
01872            src += 4;
01873        }
01874        
01875        CHECKPOINT();
01876        
01877        while (w)
01878        {
01879            __m64 vsrc = load8888 (*src);
01880            ullong d = *dst;
01881            __m64 vdest = expand565 ((__m64)d, 0);
01882            
01883            vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
01884            
01885            *dst = (ullong)vdest;
01886            
01887            w--;
01888            dst++;
01889            src++;
01890        }
01891     }
01892     
01893     _mm_empty();
01894 }
01895 
01896 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
01897 
01898 void
01899 fbCompositeSrc_8888RevNPx8888mmx (pixman_operator_t      op,
01900                               PicturePtr pSrc,
01901                               PicturePtr pMask,
01902                               PicturePtr pDst,
01903                               INT16      xSrc,
01904                               INT16      ySrc,
01905                               INT16      xMask,
01906                               INT16      yMask,
01907                               INT16      xDst,
01908                               INT16      yDst,
01909                               CARD16     width,
01910                               CARD16     height)
01911 {
01912     CARD32    *dstLine, *dst;
01913     CARD32    *srcLine, *src;
01914     FbStride  dstStride, srcStride;
01915     CARD16    w;
01916     
01917     CHECKPOINT();
01918     
01919     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
01920     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
01921     
01922     assert (pSrc->pDrawable == pMask->pDrawable);
01923     
01924     while (height--)
01925     {
01926        dst = dstLine;
01927        dstLine += dstStride;
01928        src = srcLine;
01929        srcLine += srcStride;
01930        w = width;
01931        
01932        while (w && (unsigned long)dst & 7)
01933        {
01934            __m64 s = load8888 (*src);
01935            __m64 d = load8888 (*dst);
01936            
01937            *dst = store8888 (over_rev_non_pre (s, d));
01938            
01939            w--;
01940            dst++;
01941            src++;
01942        }
01943        
01944        while (w >= 2)
01945        {
01946            ullong s0, s1;
01947            unsigned char a0, a1;
01948            __m64 d0, d1;
01949            
01950            s0 = *src;
01951            s1 = *(src + 1);
01952            
01953            a0 = (s0 >> 24);
01954            a1 = (s1 >> 24);
01955            
01956            if ((a0 & a1) == 0xFF)
01957            {
01958               d0 = invert_colors(load8888(s0));
01959               d1 = invert_colors(load8888(s1));
01960               
01961               *(__m64 *)dst = pack8888 (d0, d1);
01962            }
01963            else if (a0 | a1)
01964            {
01965               __m64 vdest = *(__m64 *)dst;
01966               
01967               d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
01968               d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
01969               
01970               *(__m64 *)dst = pack8888 (d0, d1);
01971            }
01972            
01973            w -= 2;
01974            dst += 2;
01975            src += 2;
01976        }
01977        
01978        while (w)
01979        {
01980            __m64 s = load8888 (*src);
01981            __m64 d = load8888 (*dst);
01982            
01983            *dst = store8888 (over_rev_non_pre (s, d));
01984            
01985            w--;
01986            dst++;
01987            src++;
01988        }
01989     }
01990     
01991     _mm_empty();
01992 }
01993 
01994 void
01995 fbCompositeSolidMask_nx8888x0565Cmmx (pixman_operator_t      op,
01996                                   PicturePtr pSrc,
01997                                   PicturePtr pMask,
01998                                   PicturePtr pDst,
01999                                   INT16      xSrc,
02000                                   INT16      ySrc,
02001                                   INT16      xMask,
02002                                   INT16      yMask,
02003                                   INT16      xDst,
02004                                   INT16      yDst,
02005                                   CARD16     width,
02006                                   CARD16     height)
02007 {
02008     CARD32    src, srca;
02009     CARD16    *dstLine;
02010     CARD32    *maskLine;
02011     FbStride  dstStride, maskStride;
02012     __m64  vsrc, vsrca;
02013     
02014     CHECKPOINT();
02015     
02016     fbComposeGetSolid(pSrc, pDst, src);
02017     
02018     srca = src >> 24;
02019     if (srca == 0)
02020        return;
02021     
02022     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
02023     fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
02024     
02025     vsrc = load8888 (src);
02026     vsrca = expand_alpha (vsrc);
02027     
02028     while (height--)
02029     {
02030        int twidth = width;
02031        CARD32 *p = (CARD32 *)maskLine;
02032        CARD16 *q = (CARD16 *)dstLine;
02033        
02034        while (twidth && ((unsigned long)q & 7))
02035        {
02036            CARD32 m = *(CARD32 *)p;
02037            
02038            if (m)
02039            {
02040               ullong d = *q;
02041               __m64 vdest = expand565 ((__m64)d, 0);
02042               vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
02043               *q = (ullong)vdest;
02044            }
02045            
02046            twidth--;
02047            p++;
02048            q++;
02049        }
02050        
02051        while (twidth >= 4)
02052        {
02053            CARD32 m0, m1, m2, m3;
02054            
02055            m0 = *p;
02056            m1 = *(p + 1);
02057            m2 = *(p + 2);
02058            m3 = *(p + 3);
02059            
02060            if ((m0 | m1 | m2 | m3))
02061            {
02062               __m64 vdest = *(__m64 *)q;
02063               
02064               vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
02065               vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
02066               vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
02067               vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
02068               
02069               *(__m64 *)q = vdest;
02070            }
02071            twidth -= 4;
02072            p += 4;
02073            q += 4;
02074        }
02075        
02076        while (twidth)
02077        {
02078            CARD32 m;
02079            
02080            m = *(CARD32 *)p;
02081            if (m)
02082            {
02083               ullong d = *q;
02084               __m64 vdest = expand565((__m64)d, 0);
02085               vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
02086               *q = (ullong)vdest;
02087            }
02088            
02089            twidth--;
02090            p++;
02091            q++;
02092        }
02093        
02094        maskLine += maskStride;
02095        dstLine += dstStride;
02096     }
02097     
02098     _mm_empty ();
02099 }
02100 
02101 void
02102 fbCompositeSrcAdd_8000x8000mmx (pixman_operator_t       op,
02103                             PicturePtr pSrc,
02104                             PicturePtr pMask,
02105                             PicturePtr pDst,
02106                             INT16      xSrc,
02107                             INT16      ySrc,
02108                             INT16      xMask,
02109                             INT16      yMask,
02110                             INT16      xDst,
02111                             INT16      yDst,
02112                             CARD16     width,
02113                             CARD16     height)
02114 {
02115     CARD8     *dstLine, *dst;
02116     CARD8     *srcLine, *src;
02117     FbStride  dstStride, srcStride;
02118     CARD16    w;
02119     CARD8     s, d;
02120     CARD16    t;
02121     
02122     CHECKPOINT();
02123     
02124     fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
02125     fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
02126     
02127     while (height--)
02128     {
02129        dst = dstLine;
02130        dstLine += dstStride;
02131        src = srcLine;
02132        srcLine += srcStride;
02133        w = width;
02134        
02135        while (w && (unsigned long)dst & 7)
02136        {
02137            s = *src;
02138            d = *dst;
02139            t = d + s;
02140            s = t | (0 - (t >> 8));
02141            *dst = s;
02142            
02143            dst++;
02144            src++;
02145            w--;
02146        }
02147        
02148        while (w >= 8)
02149        {
02150            *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
02151            dst += 8;
02152            src += 8;
02153            w -= 8;
02154        }
02155        
02156        while (w)
02157        {
02158            s = *src;
02159            d = *dst;
02160            t = d + s;
02161            s = t | (0 - (t >> 8));
02162            *dst = s;
02163            
02164            dst++;
02165            src++;
02166            w--;
02167        }
02168     }
02169     
02170     _mm_empty();
02171 }
02172 
02173 void
02174 fbCompositeSrcAdd_8888x8888mmx (pixman_operator_t              op,
02175                             PicturePtr    pSrc,
02176                             PicturePtr    pMask,
02177                             PicturePtr     pDst,
02178                             INT16          xSrc,
02179                             INT16      ySrc,
02180                             INT16      xMask,
02181                             INT16      yMask,
02182                             INT16      xDst,
02183                             INT16      yDst,
02184                             CARD16     width,
02185                             CARD16     height)
02186 {
02187     CARD32    *dstLine, *dst;
02188     CARD32    *srcLine, *src;
02189     FbStride  dstStride, srcStride;
02190     CARD16    w;
02191     
02192     CHECKPOINT();
02193     
02194     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
02195     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
02196     
02197     while (height--)
02198     {
02199        dst = dstLine;
02200        dstLine += dstStride;
02201        src = srcLine;
02202        srcLine += srcStride;
02203        w = width;
02204        
02205        while (w && (unsigned long)dst & 7)
02206        {
02207            *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
02208                                            _mm_cvtsi32_si64(*dst)));
02209            dst++;
02210            src++;
02211            w--;
02212        }
02213        
02214        while (w >= 2)
02215        {
02216            *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
02217            dst += 2;
02218            src += 2;
02219            w -= 2;
02220        }
02221        
02222        if (w)
02223        {
02224            *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
02225                                            _mm_cvtsi32_si64(*dst)));
02226            
02227        }
02228     }
02229     
02230     _mm_empty();
02231 }
02232 
02233 Bool
02234 fbSolidFillmmx (FbPixels    *pDraw,
02235               int           x,
02236               int           y,
02237               int           width,
02238               int           height,
02239               FbBits        xor)
02240 { 
02241     FbStride  stride;
02242     int              bpp;
02243     ullong    fill;
02244     __m64     vfill;
02245     CARD32    byte_width;
02246     CARD8     *byte_line;
02247     FbBits      *bits;
02248     int              xoff, yoff;
02249     
02250     CHECKPOINT();
02251     
02252     fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
02253     
02254     if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
02255        return FALSE;
02256     
02257     if (bpp != 16 && bpp != 32)
02258        return FALSE;
02259     
02260     if (bpp == 16)
02261     {
02262        stride = stride * sizeof (FbBits) / 2;
02263        byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff));
02264        byte_width = 2 * width;
02265        stride *= 2;
02266     }
02267     else
02268     {
02269        stride = stride * sizeof (FbBits) / 4;
02270        byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff));
02271        byte_width = 4 * width;
02272        stride *= 4;
02273     }
02274     
02275     fill = ((ullong)xor << 32) | xor;
02276     vfill = (__m64)fill;
02277     
02278     while (height--)
02279     {
02280        int w;
02281        CARD8 *d = byte_line;
02282        byte_line += stride;
02283        w = byte_width;
02284        
02285        while (w >= 2 && ((unsigned long)d & 3))
02286        {
02287            *(CARD16 *)d = xor;
02288            w -= 2;
02289            d += 2;
02290        }
02291        
02292        while (w >= 4 && ((unsigned long)d & 7))
02293        {
02294            *(CARD32 *)d = xor;
02295            
02296            w -= 4;
02297            d += 4;
02298        }
02299        
02300        while (w >= 64)
02301        {
02302            *(__m64*) (d +  0) = vfill;
02303            *(__m64*) (d +  8) = vfill;
02304            *(__m64*) (d + 16) = vfill;
02305            *(__m64*) (d + 24) = vfill;
02306            *(__m64*) (d + 32) = vfill;
02307            *(__m64*) (d + 40) = vfill;
02308            *(__m64*) (d + 48) = vfill;
02309            *(__m64*) (d + 56) = vfill;
02310            
02311            w -= 64;
02312            d += 64;
02313        }
02314        while (w >= 4)
02315        {
02316            *(CARD32 *)d = xor;
02317            
02318            w -= 4;
02319            d += 4;
02320        }
02321        if (w >= 2)
02322        {
02323            *(CARD16 *)d = xor;
02324            w -= 2;
02325            d += 2;
02326        }
02327     }
02328     
02329     _mm_empty();
02330     return TRUE;
02331 }
02332 
02333 Bool
02334 fbCopyAreammx (FbPixels     *pSrc,
02335               FbPixels      *pDst,
02336               int           src_x,
02337               int           src_y,
02338               int           dst_x,
02339               int           dst_y,
02340               int           width,
02341               int           height)
02342 {
02343     FbBits *  src_bits;
02344     FbStride  src_stride;
02345     int              src_bpp;
02346     int              src_xoff;
02347     int              src_yoff;
02348 
02349     FbBits *  dst_bits;
02350     FbStride  dst_stride;
02351     int              dst_bpp;
02352     int              dst_xoff;
02353     int              dst_yoff;
02354 
02355     CARD8 *   src_bytes;
02356     CARD8 *   dst_bytes;
02357     int              byte_width;
02358 
02359     fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
02360     fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
02361 
02362     if (src_bpp != 16 && src_bpp != 32)
02363        return FALSE;
02364 
02365     if (dst_bpp != 16 && dst_bpp != 32)
02366        return FALSE;
02367 
02368     if (src_bpp != dst_bpp)
02369     {
02370        return FALSE;
02371     }
02372     
02373     if (src_bpp == 16)
02374     {
02375        src_stride = src_stride * sizeof (FbBits) / 2;
02376        dst_stride = dst_stride * sizeof (FbBits) / 2;
02377        src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y + src_yoff) + (src_x + src_xoff));
02378        dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y + dst_yoff) + (dst_x + dst_xoff));
02379        byte_width = 2 * width;
02380        src_stride *= 2;
02381        dst_stride *= 2;
02382     }
02383     else
02384     {
02385        src_stride = src_stride * sizeof (FbBits) / 4;
02386        dst_stride = dst_stride * sizeof (FbBits) / 4;
02387        src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y + src_yoff) + (src_x + src_xoff));
02388        dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y + dst_yoff) + (dst_x + dst_xoff));
02389        byte_width = 4 * width;
02390        src_stride *= 4;
02391        dst_stride *= 4;
02392     }
02393 
02394     while (height--)
02395     {
02396        int w;
02397        CARD8 *s = src_bytes;
02398        CARD8 *d = dst_bytes;
02399        src_bytes += src_stride;
02400        dst_bytes += dst_stride;
02401        w = byte_width;
02402        
02403        while (w >= 2 && ((unsigned long)d & 3))
02404        {
02405            *(CARD16 *)d = *(CARD16 *)s;
02406            w -= 2;
02407            s += 2;
02408            d += 2;
02409        }
02410        
02411        while (w >= 4 && ((unsigned int)d & 7))
02412        {
02413            *(CARD32 *)d = *(CARD32 *)s;
02414            
02415            w -= 4;
02416            s += 4;
02417            d += 4;
02418        }
02419        
02420        while (w >= 64)
02421        {
02422            *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
02423            *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
02424            *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
02425            *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
02426            *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
02427            *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
02428            *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
02429            *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
02430            w -= 64;
02431            s += 64;
02432            d += 64;
02433        }
02434        while (w >= 4)
02435        {
02436            *(CARD32 *)d = *(CARD32 *)s;
02437 
02438            w -= 4;
02439            s += 4;
02440            d += 4;
02441        }
02442        if (w >= 2)
02443        {
02444            *(CARD16 *)d = *(CARD16 *)s;
02445            w -= 2;
02446            s += 2;
02447            d += 2;
02448        }
02449     }
02450     
02451     _mm_empty();
02452     return TRUE;
02453 }
02454 
02455 void
02456 fbCompositeCopyAreammx (pixman_operator_t        op,
02457                      PicturePtr    pSrc,
02458                      PicturePtr    pMask,
02459                      PicturePtr    pDst,
02460                      INT16         xSrc,
02461                      INT16         ySrc,
02462                      INT16         xMask,
02463                      INT16         yMask,
02464                      INT16         xDst,
02465                      INT16         yDst,
02466                      CARD16        width,
02467                      CARD16        height)
02468 {
02469     fbCopyAreammx (pSrc->pDrawable,
02470                  pDst->pDrawable,
02471                  xSrc, ySrc,
02472                  xDst, yDst,
02473                  width, height);
02474 }
02475 
02476 #endif /* RENDER */