os/ossrv/genericopenlibs/liboil/src/fb/fbmmx.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2  * Copyright © 2004 Red Hat, Inc.
     3  * Copyright © 2004 Nicholas Miell
     4  * Copyright © 2005 Trolltech AS
     5  *
     6  * Permission to use, copy, modify, distribute, and sell this software and its
     7  * documentation for any purpose is hereby granted without fee, provided that
     8  * the above copyright notice appear in all copies and that both that
     9  * copyright notice and this permission notice appear in supporting
    10  * documentation, and that the name of Red Hat not be used in advertising or
    11  * publicity pertaining to distribution of the software without specific,
    12  * written prior permission.  Red Hat makes no representations about the
    13  * suitability of this software for any purpose.  It is provided "as is"
    14  * without express or implied warranty.
    15  *
    16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    23  * SOFTWARE.
    24  *
    25  * Author:  Søren Sandmann (sandmann@redhat.com)
    26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
    27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) 
    28  *
    29  * Based on work by Owen Taylor
    30  */
    31 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
    32 
    33 #ifdef HAVE_CONFIG_H
    34 #include "config.h"
    35 #endif
    36 
    37 #include <liboil/liboil.h>
    38 #include <liboil/liboilfunction.h>
    39  
    40 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
    41 
    /* Short aliases for the fixed-width integer types used throughout this file. */
    42 typedef uint32_t CARD32;
    43 typedef uint16_t CARD16;
    44 typedef int16_t INT16;
    45 typedef uint8_t CARD8;
    46 typedef uint64_t ullong;
    /* In this file PicturePtr and FbBits are plain pointers to 32-bit words,
     * and FbStride is a plain int. */
    47 typedef CARD32* PicturePtr;
    48 typedef CARD32* FbBits;
    49 typedef int FbStride;
    50 
    51 
    52 #include "fbmmx.h"
    53 #include "fbpict.h"
    54 
    /* CHECKPOINT() expands to nothing here, so every call site is a no-op. */
    55 #define CHECKPOINT()
    56 
    /*
     * liboil function classes referenced by this file.  In the portion of the
     * file reviewed, implementations are registered with OIL_DEFINE_IMPL_FULL
     * for composite_over_argb, composite_add_argb and
     * composite_over_argb_const_src only.
     */
    57 OIL_DECLARE_CLASS (composite_in_argb);
    58 OIL_DECLARE_CLASS (composite_in_argb_const_src);
    59 OIL_DECLARE_CLASS (composite_in_argb_const_mask);
    60 OIL_DECLARE_CLASS (composite_over_argb);
    61 OIL_DECLARE_CLASS (composite_over_argb_const_src);
    62 OIL_DECLARE_CLASS (composite_add_argb);
    63 OIL_DECLARE_CLASS (composite_add_argb_const_src);
    64 OIL_DECLARE_CLASS (composite_in_over_argb);
    65 OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
    66 OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
    67 OIL_DECLARE_CLASS (composite_over_u8);
    68 OIL_DECLARE_CLASS (composite_add_u8);
    69 
    70 
    71 /* --------------- MMX code paths for fbcompose.c --------------------- */
    72 
    73 #if 0
    /*
     * mmxCombineMaskU (compiled out by the surrounding #if 0).
     *
     * For each of `width` elements: converts *mask and *src with MmxTo,
     * replaces the mask value by MmxAlpha of itself, applies MmxMul(s, a)
     * and stores MmxFrom(s) to *dest.  Presumably this scales each source
     * pixel by the mask's alpha; the Mmx* macros come from fbmmx.h, so
     * confirm there.
     *
     * mmx_0 and mmx_4x0080 are not referenced by name below; presumably the
     * Mmx* macros expand to uses of them -- confirm in fbmmx.h.
     *
     * NOTE(review): mask is a byte pointer, so MmxTo(*mask) converts a single
     * 8-bit value; whether MmxAlpha then yields the intended factor should be
     * checked before this code is re-enabled.
     */
    74 static void
    75 mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
    76 {
    77     const __m64 mmx_0 = _mm_setzero_si64();
    78     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    79     
           /* The loop bound walks the mask, so it must have the mask's pointer
            * type; it was declared as const uint32_t *, which does not match
            * `mask + width` or the `mask < end` comparison. */
    80     const uint8_t *end = mask + width;
    81     while (mask < end) {
    82         __m64 a = MmxTo(*mask);
    83         __m64 s = MmxTo(*src);
    84         a = MmxAlpha(a);
    85         MmxMul(s, a);
    86         *dest = MmxFrom(s);
    87         ++src;
    88         ++dest;
    89         ++mask;
    90     }
           /* Leave MMX state so later x87 floating-point code works. */
    91     _mm_empty();
    92 }
    93 #endif
    94 
    95 #ifdef ENABLE_BROKEN_IMPLS
    /*
     * mmxCombineOverU -- only built when ENABLE_BROKEN_IMPLS is defined.
     *
     * For each of `width` pixels: x = MmxTo(*src), y = MmxTo(*dest),
     * a = MmxNegate(MmxAlpha(x)), then MmxMulAdd(y, a, x) and
     * *dest = MmxFrom(y).  Presumably dest = src + dest * (1 - src alpha),
     * i.e. the OVER operator; the Mmx* macros come from fbmmx.h -- confirm.
     *
     * The mmx_* locals are not referenced by name below; presumably the
     * Mmx* macros expand to uses of them -- confirm in fbmmx.h.
     *
     * Registered as an MMX implementation of composite_over_argb.
     */
    96 static void
    97 mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
    98 {
    99     const __m64 mmx_0 = _mm_setzero_si64();
   100     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   101     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   102 
   103     const uint32_t *end = dest + width;
   104 
   105     while (dest < end) {
   106         __m64 x, y, a;
   107         x = MmxTo(*src);
   108         y = MmxTo(*dest);
   109         a = MmxAlpha(x);
   110         a = MmxNegate(a);
   111         MmxMulAdd(y, a, x);
   112         *dest = MmxFrom(y);
   113         ++dest;
   114         ++src;
   115     }
           /* Leave MMX state so later x87 floating-point code works. */
   116     _mm_empty();
   117 }
   118 OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
   119 #endif
   120 
   121 #if 0
   /*
    * mmxCombineOverReverseU (compiled out by the surrounding #if 0).
    *
    * Same macro sequence as mmxCombineOverU with the roles swapped: the alpha
    * is taken from the destination pixel (x = MmxTo(*dest)) and
    * MmxMulAdd(y, a, x) is applied to the source value y.  Presumably
    * dest = dest + src * (1 - dest alpha); confirm the macros in fbmmx.h.
    */
   122 static FASTCALL void
   123 mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
   124 {
   125     const __m64 mmx_0 = _mm_setzero_si64();
   126     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   127     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   128 
   129     const CARD32 *end = dest + width;
   130 
   131     while (dest < end) {
   132         __m64 x, y, a;
   133         x = MmxTo(*dest);
   134         y = MmxTo(*src);
   135         a = MmxAlpha(x);
   136         a = MmxNegate(a);
   137         MmxMulAdd(y, a, x);
   138         *dest = MmxFrom(y);
   139         ++dest;
   140         ++src;
   141     }
   142     _mm_empty();
   143 }
   144 #endif
   145 
   146 #if 0
   /*
    * mmxCombineInU (compiled out by the surrounding #if 0).
    *
    * Per pixel: x = MmxTo(*src), a = MmxAlpha(MmxTo(*dest)), MmxMul(x, a),
    * *dest = MmxFrom(x).  Presumably dest = src * dest alpha (the IN
    * operator); the Mmx* macros come from fbmmx.h -- confirm there.
    */
   147 static void
   148 mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
   149 {
   150     const __m64 mmx_0 = _mm_setzero_si64();
   151     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   152 
   153     const CARD32 *end = dest + width;
   154 
   155     while (dest < end) {
   156         __m64 x, a;
   157         x = MmxTo(*src);
   158         a = MmxTo(*dest);
   159         a = MmxAlpha(a);
   160         MmxMul(x, a);
   161         *dest = MmxFrom(x);
   162         ++dest;
   163         ++src;
   164     }
   165     _mm_empty();
   166 }
   167 #endif
   168 
   169 #if 0
   /*
    * mmxCombineInReverseU (compiled out by the surrounding #if 0).
    *
    * Per pixel: x = MmxTo(*dest), a = MmxAlpha(MmxTo(*src)), MmxMul(x, a),
    * *dest = MmxFrom(x).  Presumably dest = dest * src alpha; confirm the
    * macros in fbmmx.h.
    */
   170 static FASTCALL void
   171 mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
   172 {
   173     const __m64 mmx_0 = _mm_setzero_si64();
   174     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   175 
   176     const CARD32 *end = dest + width;
   177 
   178     while (dest < end) {
   179         __m64 x, a;
   180         x = MmxTo(*dest);
   181         a = MmxTo(*src);
   182         a = MmxAlpha(a);
   183         MmxMul(x, a);
   184         *dest = MmxFrom(x);
   185         ++dest;
   186         ++src;
   187     }
   188     _mm_empty();
   189 }
   190 #endif
   191 
   192 #if 0
   /*
    * mmxCombineOutU (compiled out by the surrounding #if 0).
    *
    * Per pixel: x = MmxTo(*src), a = MmxNegate(MmxAlpha(MmxTo(*dest))),
    * MmxMul(x, a), *dest = MmxFrom(x).  Presumably
    * dest = src * (1 - dest alpha); confirm the macros in fbmmx.h.
    */
   193 static FASTCALL void
   194 mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
   195 {
   196     const __m64 mmx_0 = _mm_setzero_si64();
   197     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   198     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   199 
   200     const CARD32 *end = dest + width;
   201 
   202     while (dest < end) {
   203         __m64 x, a;
   204         x = MmxTo(*src);
   205         a = MmxTo(*dest);
   206         a = MmxAlpha(a);
   207         a = MmxNegate(a);
   208         MmxMul(x, a);
   209         *dest = MmxFrom(x);
   210         ++dest;
   211         ++src;
   212     }
   213     _mm_empty();
   214 }
   215 #endif
   216 
   217 #if 0
   /*
    * mmxCombineOutReverseU (compiled out by the surrounding #if 0).
    *
    * Per pixel: x = MmxTo(*dest), a = MmxNegate(MmxAlpha(MmxTo(*src))),
    * MmxMul(x, a), *dest = MmxFrom(x).  Presumably
    * dest = dest * (1 - src alpha); confirm the macros in fbmmx.h.
    */
   218 static FASTCALL void
   219 mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
   220 {
   221     const __m64 mmx_0 = _mm_setzero_si64();
   222     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   223     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   224 
   225     const CARD32 *end = dest + width;
   226 
   227     while (dest < end) {
   228         __m64 x, a;
   229         x = MmxTo(*dest);
   230         a = MmxTo(*src);
   231         a = MmxAlpha(a);
   232         a = MmxNegate(a);
   233         MmxMul(x, a);
   234         *dest = MmxFrom(x);
   235         ++dest;
   236         ++src;
   237     }
   238     _mm_empty();
   239 }
   240 
   /*
    * mmxCombineAtopU (compiled out by the surrounding #if 0).
    *
    * Per pixel: s/d are the MmxTo forms of source and destination,
    * sia = MmxNegate(MmxAlpha(s)), da = MmxAlpha(d); then
    * MmxAddMul(s, da, d, sia) and *dest = MmxFrom(s).  Presumably
    * dest = src * dest alpha + dest * (1 - src alpha); MmxAddMul is defined
    * in fbmmx.h -- confirm its operand order there.
    */
   241 static FASTCALL void
   242 mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
   243 {
   244     const __m64 mmx_0 = _mm_setzero_si64();
   245     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   246     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   247 
   248     const CARD32 *end = dest + width;
   249 
   250     while (dest < end) {
   251         __m64 s, da, d, sia;
   252         s = MmxTo(*src);
   253         d = MmxTo(*dest);
   254         sia = MmxAlpha(s);
   255         sia = MmxNegate(sia);
   256         da = MmxAlpha(d);
   257         MmxAddMul(s, da, d, sia);
   258         *dest = MmxFrom(s);
   259         ++dest;
   260         ++src;
   261     }
   262     _mm_empty();
   263 }
   264 
   /*
    * mmxCombineAtopReverseU (compiled out by the surrounding #if 0).
    *
    * Per pixel: sa = MmxAlpha(s), dia = MmxNegate(MmxAlpha(d)); then
    * MmxAddMul(s, dia, d, sa) and *dest = MmxFrom(s).  Presumably
    * dest = src * (1 - dest alpha) + dest * src alpha; confirm MmxAddMul
    * in fbmmx.h.
    */
   265 static FASTCALL void
   266 mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
   267 {
   268     const __m64 mmx_0 = _mm_setzero_si64();
   269     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   270     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   271 
   272     const CARD32 *end;
   273 
   274     end = dest + width;
   275 
   276     while (dest < end) {
   277         __m64 s, dia, d, sa;
   278         s = MmxTo(*src);
   279         d = MmxTo(*dest);
   280         sa = MmxAlpha(s);
   281         dia = MmxAlpha(d);
   282         dia = MmxNegate(dia);
   283         MmxAddMul(s, dia, d, sa);
   284         *dest = MmxFrom(s);
   285         ++dest;
   286         ++src;
   287     }
   288     _mm_empty();
   289 }
   290 
   /*
    * mmxCombineXorU (compiled out by the surrounding #if 0).
    *
    * Per pixel: sia and dia are the MmxNegate'd MmxAlpha values of source
    * and destination; then MmxAddMul(s, dia, d, sia) and
    * *dest = MmxFrom(s).  Presumably
    * dest = src * (1 - dest alpha) + dest * (1 - src alpha); confirm
    * MmxAddMul in fbmmx.h.
    */
   291 static FASTCALL void
   292 mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
   293 {
   294     const __m64 mmx_0 = _mm_setzero_si64();
   295     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   296     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   297 
   298     const CARD32 *end = dest + width;
   299 
   300     while (dest < end) {
   301         __m64 s, dia, d, sia;
   302         s = MmxTo(*src);
   303         d = MmxTo(*dest);
   304         sia = MmxAlpha(s);
   305         dia = MmxAlpha(d);
   306         sia = MmxNegate(sia);
   307         dia = MmxNegate(dia);
   308         MmxAddMul(s, dia, d, sia);
   309         *dest = MmxFrom(s);
   310         ++dest;
   311         ++src;
   312     }
   313     _mm_empty();
   314 }
   315 #endif
   316 
   /*
    * mmxCombineAddU -- adds `width` source pixels into the destination.
    *
    * Per pixel: s = MmxTo(*src), d = MmxTo(*dest), s = MmxAdd(s, d),
    * *dest = MmxFrom(s).  The Mmx* macros come from fbmmx.h; presumably the
    * result is a per-channel saturating sum -- confirm there.
    *
    * mmx_0 is not referenced by name below; presumably MmxTo/MmxFrom expand
    * to uses of it -- confirm in fbmmx.h.
    *
    * Registered as an implementation of composite_add_argb with the MMX and
    * SSE implementation flags.
    */
   317 static void
   318 mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
   319 {
   320     const __m64 mmx_0 = _mm_setzero_si64();
   321 
   322     const uint32_t *end = dest + width;
   323     while (dest < end) {
   324         __m64 s, d;
   325         s = MmxTo(*src);
   326         d = MmxTo(*dest);
   327         s = MmxAdd(s, d);
   328         *dest = MmxFrom(s);
   329         ++dest;
   330         ++src;
   331     }
           /* Leave MMX state so later x87 floating-point code works. */
   332     _mm_empty();
   333 }
   334 OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
   335 
   336 #if 0
   /*
    * mmxCombineSaturateU (compiled out by the surrounding #if 0).
    *
    * Per pixel: sa is the source's top byte and da is the top byte of the
    * bitwise-inverted destination.  When sa > da the source is first
    * scaled with MmxMul by MmxAlpha(MmxTo(FbIntDiv(da, sa))); the
    * (possibly scaled) source is then added to the destination with MmxAdd
    * and written back.  FbIntDiv and the Mmx* macros are defined outside
    * this file -- confirm their exact semantics there.
    */
   337 static FASTCALL void
   338 mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
   339 {
   340     const __m64 mmx_0 = _mm_setzero_si64();
   341     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   342 
   343     const CARD32 *end = dest + width;
   344     while (dest < end) {
   345         CARD32 s = *src;
   346         CARD32 d = *dest;
   347         __m64 ms = MmxTo(s);
   348         __m64 md = MmxTo(d);
   349         CARD32 sa = s >> 24;
   350         CARD32 da = ~d >> 24;
   351 
   352         if (sa > da) {
   353             __m64 msa = MmxTo(FbIntDiv(da, sa));
   354             msa = MmxAlpha(msa);
   355             MmxMul(ms, msa);
   356         }
               /* MmxAdd is used as a value-returning expression elsewhere in
                * this file (s = MmxAdd(s, d)), so its result must be kept;
                * the bare call dropped the sum and stored the unmodified
                * destination. */
   357         md = MmxAdd(md, ms);
   358         *dest = MmxFrom(md);
   359         ++src;
   360         ++dest;
   361     }
   362     _mm_empty();
   363 }
   364 
   365 
   /*
    * mmxCombineSrcC (compiled out by the surrounding #if 0).
    *
    * Per pixel: a = MmxTo(*mask), s = MmxTo(*src), MmxMul(s, a),
    * *dest = MmxFrom(s).  Presumably a per-channel product of source and
    * mask; confirm MmxMul in fbmmx.h.  The loop runs over `width` source
    * pixels (the bound is src + width).
    */
   366 static FASTCALL void
   367 mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   368 {
   369     const __m64 mmx_0 = _mm_setzero_si64();
   370     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   371 
   372     const CARD32 *end = src + width;
   373     while (src < end) {
   374         __m64 a = MmxTo(*mask);
   375         __m64 s = MmxTo(*src);
   376         MmxMul(s, a);
   377         *dest = MmxFrom(s);
   378         ++src;
   379         ++mask;
   380         ++dest;
   381     }
   382     _mm_empty();
   383 }
   384 
   /*
    * mmxCombineOverC (compiled out by the surrounding #if 0).
    *
    * Per pixel: sa = MmxAlpha(s) is taken before s is modified; then
    * MmxMul(s, a), MmxMul(a, sa), a = MmxNegate(a), MmxMulAdd(d, a, s) and
    * *dest = MmxFrom(d).  Presumably
    * dest = src*mask + dest * (1 - mask * src alpha), per channel; confirm
    * the macros in fbmmx.h.
    */
   385 static FASTCALL void
   386 mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   387 {
   388     const __m64 mmx_0 = _mm_setzero_si64();
   389     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   390     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   391     
   392     const CARD32 *end = src + width;
   393     while (src < end) {
   394         __m64 a = MmxTo(*mask);
   395         __m64 s = MmxTo(*src);
   396         __m64 d = MmxTo(*dest);
   397         __m64 sa = MmxAlpha(s);
   398         MmxMul(s, a);
   399         MmxMul(a, sa);
   400         a = MmxNegate(a);
   401         MmxMulAdd(d, a, s);
   402         *dest = MmxFrom(d);
   403         ++src;
   404         ++dest;
   405         ++mask;
   406     }
   407     _mm_empty();
   408 }
   409 
   /*
    * mmxCombineOverReverseC (compiled out by the surrounding #if 0).
    *
    * Per pixel: da = MmxNegate(MmxAlpha(d)), MmxMul(s, a),
    * MmxMulAdd(s, da, d), *dest = MmxFrom(s).  Presumably
    * dest = dest + src*mask * (1 - dest alpha); confirm the macros in
    * fbmmx.h.
    */
   410 static FASTCALL void
   411 mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   412 {
   413     const __m64 mmx_0 = _mm_setzero_si64();
   414     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   415     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   416     
   417     const CARD32 *end = src + width;
   418     while (src < end) {
   419         __m64 a = MmxTo(*mask);
   420         __m64 s = MmxTo(*src);
   421         __m64 d = MmxTo(*dest);
   422         __m64 da = MmxAlpha(d);
   423         da = MmxNegate(da);
   424         MmxMul(s, a);
   425         MmxMulAdd(s, da, d);
   426         *dest = MmxFrom(s);
   427         ++src;
   428         ++dest;
   429         ++mask;
   430     }
   431     _mm_empty();
   432 }
   433 
   434 
   /*
    * mmxCombineInC (compiled out by the surrounding #if 0).
    *
    * Per pixel: MmxMul(s, a) followed by MmxMul(s, da) with
    * da = MmxAlpha(d); *dest = MmxFrom(s).  Presumably
    * dest = src * mask * dest alpha; confirm MmxMul in fbmmx.h.
    */
   435 static FASTCALL void
   436 mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   437 {
   438     const __m64 mmx_0 = _mm_setzero_si64();
   439     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   440     
   441     const CARD32 *end = src + width;
   442     while (src < end) {
   443         __m64 a = MmxTo(*mask);
   444         __m64 s = MmxTo(*src);
   445         __m64 d = MmxTo(*dest);
   446         __m64 da = MmxAlpha(d);
   447         MmxMul(s, a);
   448         MmxMul(s, da);
   449         *dest = MmxFrom(s);
   450         ++src;
   451         ++dest;
   452         ++mask;
   453     }
   454     _mm_empty();
   455 }
   456 
   /*
    * mmxCombineInReverseC (compiled out by the surrounding #if 0).
    *
    * Per pixel: MmxMul(a, sa) with sa = MmxAlpha(s), then MmxMul(d, a) and
    * *dest = MmxFrom(d).  Presumably dest = dest * (mask * src alpha);
    * confirm MmxMul in fbmmx.h.
    */
   457 static FASTCALL void
   458 mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   459 {
   460     const __m64 mmx_0 = _mm_setzero_si64();
   461     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   462     
   463     const CARD32 *end = src + width;
   464     while (src < end) {
   465         __m64 a = MmxTo(*mask);
   466         __m64 s = MmxTo(*src);
   467         __m64 d = MmxTo(*dest);
   468         __m64 sa = MmxAlpha(s);
   469         MmxMul(a, sa);
   470         MmxMul(d, a);
   471         *dest = MmxFrom(d);
   472         ++src;
   473         ++dest;
   474         ++mask;
   475     }
   476     _mm_empty();
   477 }
   478 
   /*
    * mmxCombineOutC (compiled out by the surrounding #if 0).
    *
    * Per pixel: da = MmxNegate(MmxAlpha(d)), MmxMul(s, a), MmxMul(s, da),
    * *dest = MmxFrom(s).  Presumably
    * dest = src * mask * (1 - dest alpha); confirm the macros in fbmmx.h.
    */
   479 static FASTCALL void
   480 mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   481 {
   482     const __m64 mmx_0 = _mm_setzero_si64();
   483     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   484     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   485     
   486     const CARD32 *end = src + width;
   487     while (src < end) {
   488         __m64 a = MmxTo(*mask);
   489         __m64 s = MmxTo(*src);
   490         __m64 d = MmxTo(*dest);
   491         __m64 da = MmxAlpha(d);
   492         da = MmxNegate(da);
   493         MmxMul(s, a);
   494         MmxMul(s, da);
   495         *dest = MmxFrom(s);
   496         ++src;
   497         ++dest;
   498         ++mask;
   499     }
   500     _mm_empty();
   501 }
   502 
   /*
    * mmxCombineOutReverseC (compiled out by the surrounding #if 0).
    *
    * Per pixel: MmxMul(a, sa) with sa = MmxAlpha(s), a = MmxNegate(a),
    * MmxMul(d, a), *dest = MmxFrom(d).  Presumably
    * dest = dest * (1 - mask * src alpha); confirm the macros in fbmmx.h.
    */
   503 static FASTCALL void
   504 mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   505 {
   506     const __m64 mmx_0 = _mm_setzero_si64();
   507     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   508     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   509     
   510     const CARD32 *end = src + width;
   511     while (src < end) {
   512         __m64 a = MmxTo(*mask);
   513         __m64 s = MmxTo(*src);
   514         __m64 d = MmxTo(*dest);
   515         __m64 sa = MmxAlpha(s);
   516         MmxMul(a, sa);
   517         a = MmxNegate(a);
   518         MmxMul(d, a);
   519         *dest = MmxFrom(d);
   520         ++src;
   521         ++dest;
   522         ++mask;
   523     }
   524     _mm_empty();
   525 }
   526 
   /*
    * mmxCombineAtopC (compiled out by the surrounding #if 0).
    *
    * Per pixel: da and sa are the MmxAlpha values of the unmodified
    * destination and source; then MmxMul(s, a), MmxMul(a, sa),
    * a = MmxNegate(a), MmxAddMul(d, a, s, da), *dest = MmxFrom(d).
    * Presumably dest = dest * (1 - mask*src alpha) + src*mask * dest alpha;
    * confirm MmxAddMul in fbmmx.h.
    */
   527 static FASTCALL void
   528 mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   529 {
   530     const __m64 mmx_0 = _mm_setzero_si64();
   531     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   532     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   533     
   534     const CARD32 *end = src + width;
   535     while (src < end) {
   536         __m64 a = MmxTo(*mask);
   537         __m64 s = MmxTo(*src);
   538         __m64 d = MmxTo(*dest);
   539         __m64 da = MmxAlpha(d);
   540         __m64 sa = MmxAlpha(s); 
   541         MmxMul(s, a);
   542         MmxMul(a, sa);
   543         a = MmxNegate(a);
   544         MmxAddMul(d, a, s, da);
   545         *dest = MmxFrom(d);
   546         ++src;
   547         ++dest;
   548         ++mask;
   549     }
   550     _mm_empty();
   551 }
   552 
   /*
    * mmxCombineAtopReverseC (compiled out by the surrounding #if 0).
    *
    * Per pixel: da and sa are the MmxAlpha values of the unmodified
    * destination and source; then MmxMul(s, a), MmxMul(a, sa),
    * da = MmxNegate(da), MmxAddMul(d, a, s, da), *dest = MmxFrom(d).
    * Presumably dest = dest * (mask*src alpha) + src*mask * (1 - dest alpha);
    * confirm MmxAddMul in fbmmx.h.
    */
   553 static FASTCALL void
   554 mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   555 {
   556     const __m64 mmx_0 = _mm_setzero_si64();
   557     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   558     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   559     
   560     const CARD32 *end = src + width;
   561     while (src < end) {
   562         __m64 a = MmxTo(*mask);
   563         __m64 s = MmxTo(*src);
   564         __m64 d = MmxTo(*dest);
   565         __m64 da = MmxAlpha(d);
               /* The terminating ';' was missing on this declaration, which
                * would stop the file compiling if this block were enabled. */
   566         __m64 sa = MmxAlpha(s);
   567         MmxMul(s, a);
   568         MmxMul(a, sa);
   569         da = MmxNegate(da);
   570         MmxAddMul(d, a, s, da);
   571         *dest = MmxFrom(d);
   572         ++src;
   573         ++dest;
   574         ++mask;
   575     }
   576     _mm_empty();
   577 }
   578 
   /*
    * mmxCombineXorC (compiled out by the surrounding #if 0).
    *
    * Per pixel: MmxMul(s, a), MmxMul(a, sa), then both da and a are passed
    * through MmxNegate before MmxAddMul(d, a, s, da) and
    * *dest = MmxFrom(d).  Presumably
    * dest = dest * (1 - mask*src alpha) + src*mask * (1 - dest alpha);
    * confirm MmxAddMul in fbmmx.h.
    */
   579 static FASTCALL void
   580 mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   581 {
   582     const __m64 mmx_0 = _mm_setzero_si64();
   583     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   584     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   585     
   586     const CARD32 *end = src + width;
   587     while (src < end) {
   588         __m64 a = MmxTo(*mask);
   589         __m64 s = MmxTo(*src);
   590         __m64 d = MmxTo(*dest);
   591         __m64 da = MmxAlpha(d);
   592         __m64 sa = MmxAlpha(s);
   593         MmxMul(s, a);
   594         MmxMul(a, sa);
   595         da = MmxNegate(da);
   596         a = MmxNegate(a);
   597         MmxAddMul(d, a, s, da);
   598         *dest = MmxFrom(d);
   599         ++src;
   600         ++dest;
   601         ++mask;
   602     }
   603     _mm_empty();
   604 }
   605 
   /*
    * mmxCombineAddC (compiled out by the surrounding #if 0).
    *
    * Per pixel: MmxMul(s, a) with a = MmxTo(*mask), then
    * d = MmxAdd(s, d) and *dest = MmxFrom(d).  Presumably
    * dest = dest + src * mask; confirm the macros in fbmmx.h.
    */
   606 static FASTCALL void
   607 mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   608 {
   609     const __m64 mmx_0 = _mm_setzero_si64();
   610     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   611     
   612     const CARD32 *end = src + width;
   613     while (src < end) {
   614         __m64 a = MmxTo(*mask);
   615         __m64 s = MmxTo(*src);
   616         __m64 d = MmxTo(*dest);
   617         MmxMul(s, a);
   618         d = MmxAdd(s, d);
   619         *dest = MmxFrom(d);
   620         ++src;
   621         ++dest;
   622         ++mask;
   623     }
   624     _mm_empty();
   625 }
   626 
   627 extern FbComposeFunctions composeFunctions;
   628 
   /*
    * fbComposeSetupMMX (compiled out by the surrounding #if 0).
    *
    * When fbHaveMMX() is true, installs the mmxCombine* routines from this
    * file into the external composeFunctions table: the combineU and
    * combineC arrays are indexed by PictOp* constant, and combineMaskU gets
    * mmxCombineMaskU.  Does nothing otherwise.
    *
    * NOTE(review): several of the referenced functions sit in their own
    * #if 0 / ENABLE_BROKEN_IMPLS sections, so re-enabling this function
    * requires re-enabling them as well.
    */
   629 void fbComposeSetupMMX(void)
   630 {
   631     /* check if we have MMX support and initialize accordingly */
   632     if (fbHaveMMX()) {
   633         composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
   634         composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
   635         composeFunctions.combineU[PictOpIn] = mmxCombineInU;
   636         composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
   637         composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
   638         composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
   639         composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
   640         composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
   641         composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
   642         composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
   643         composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
   644 
   645         composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
   646         composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
   647         composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
   648         composeFunctions.combineC[PictOpIn] = mmxCombineInC;
   649         composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
   650         composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
   651         composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
   652         composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
   653         composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
   654         composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
   655         composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
   656 
   657         composeFunctions.combineMaskU = mmxCombineMaskU;
   658     } 
   659 }
   660 #endif
   661 
   662 
   663 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
   664 
   /*
    * Overlay of an __m64 with a 64-bit integer, so MMX constants can be
    * written as integer literals (via .ull) and read back as __m64 (via .m64).
    */
   665 typedef union {
   666   __m64 m64;
   667   uint64_t ull;
   668 } m64_ull;
   669 
   /*
    * Layout of the table of 64-bit constants used by the helpers below.
    * Each field is accessed through the MC() macro by the part of its name
    * that follows "mmx_".
    */
   670 typedef struct
   671 {
   672     m64_ull mmx_4x00ff;
   673     m64_ull mmx_4x0080;
   674     m64_ull mmx_565_rgb;
   675     m64_ull mmx_565_unpack_multiplier;
   676     m64_ull mmx_565_r;
   677     m64_ull mmx_565_g;
   678     m64_ull mmx_565_b;
   679     m64_ull mmx_mask_0;
   680     m64_ull mmx_mask_1;
   681     m64_ull mmx_mask_2;
   682     m64_ull mmx_mask_3;
   683     m64_ull mmx_full_alpha;
   684     m64_ull mmx_ffff0000ffff0000;
   685     m64_ull mmx_0000ffff00000000;
   686     m64_ull mmx_000000000000ffff;
   687 } MMXData;
   688 
   /*
    * The constant table itself, initialised through the integer view.
    *
    *   4x00ff                  0x00ff in each 16-bit lane; used by negate()
    *   4x0080                  0x0080 in each lane; rounding term in pix_multiply()
    *   565_rgb                 field mask applied in expand565()
    *   565_unpack_multiplier   per-lane multiplier applied in expand565()
    *   565_r / 565_g / 565_b   bits kept for each channel in pack565()
    *   mask_0 .. mask_3        clear one 16-bit field of the target in pack565()
    *   full_alpha              0x00ff in the top lane; used by over_rev_non_pre()
    *
    * The last three entries are not referenced in the portion of the file
    * reviewed here.
    */
   689 static const MMXData c =
   690 {
   691     .mmx_4x00ff.ull =			0x00ff00ff00ff00ffULL,
   692     .mmx_4x0080.ull =			0x0080008000800080ULL,
   693     .mmx_565_rgb.ull =			0x000001f0003f001fULL,
   694     .mmx_565_r.ull =			0x000000f800000000ULL,
   695     .mmx_565_g.ull =			0x0000000000fc0000ULL,
   696     .mmx_565_b.ull =			0x00000000000000f8ULL,
   697     .mmx_mask_0.ull =			0xffffffffffff0000ULL,
   698     .mmx_mask_1.ull =			0xffffffff0000ffffULL,
   699     .mmx_mask_2.ull =			0xffff0000ffffffffULL,
   700     .mmx_mask_3.ull =			0x0000ffffffffffffULL,
   701     .mmx_full_alpha.ull =			0x00ff000000000000ULL,
   702     .mmx_565_unpack_multiplier.ull =	0x0000008404100840ULL,
   703     .mmx_ffff0000ffff0000.ull =		0xffff0000ffff0000ULL,
   704     .mmx_0000ffff00000000.ull =		0x0000ffff00000000ULL,
   705     .mmx_000000000000ffff.ull =		0x000000000000ffffULL,
   706 };
   707 
   /* MC(name) yields the __m64 view of table entry c.mmx_<name>,
    * e.g. MC(4x00ff) reads c.mmx_4x00ff.m64. */
   708 #define MC(x) ((__m64) c.mmx_##x.m64)
   709 
   /*
    * Shift the whole 64-bit value v by a signed bit count: left by s when
    * s > 0, logically right by -s when s < 0, and unchanged when s == 0.
    */
   710 static __inline__ __m64
   711 shift (__m64 v, int s)
   712 {
   713     if (s > 0)
   714 	return _mm_slli_si64 (v, s);
   715     else if (s < 0)
   716 	return _mm_srli_si64 (v, -s);
   717     else
   718 	return v;
   719 }
   720 
   /*
    * XOR every 16-bit lane of mask with 0x00ff.  For lane values in 0..255
    * this yields 255 - value, i.e. the complement of an 8-bit coverage.
    */
   721 static __inline__ __m64
   722 negate (__m64 mask)
   723 {
   724     return _mm_xor_si64 (mask, MC(4x00ff));
   725 }
   726 
   /*
    * Lane-wise multiply of two pixels held as four 16-bit lanes.
    *
    * Computes t = a*b + 0x80 and returns (t + (t >> 8)) >> 8 in each lane
    * (the additions saturate).  For 8-bit inputs this is the usual rounded
    * approximation of a*b/255.
    */
   727 static __inline__ __m64
   728 pix_multiply (__m64 a, __m64 b)
   729 {
   730     __m64 res;
   731     
   732     res = _mm_mullo_pi16 (a, b);
   733     res = _mm_adds_pu16 (res, MC(4x0080));
   734     res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
   735     res = _mm_srli_pi16 (res, 8);
   736     
   737     return res;
   738 }
   739 
   /*
    * Broadcast the highest 16-bit lane (lane 3) of pixel into all four
    * lanes.  With pixels produced by load8888() that lane holds the top
    * byte of the 32-bit pixel.
    */
   740 static __inline__ __m64
   741 expand_alpha (__m64 pixel)
   742 {
   743     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
   744 }
   745 
   /* Broadcast the lowest 16-bit lane (lane 0) of pixel into all four lanes. */
   746 static __inline__ __m64
   747 expand_alpha_rev (__m64 pixel)
   748 {
   749     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
   750 }    
   751 
   /*
    * Swap 16-bit lanes 0 and 2 of pixel, leaving lanes 1 and 3 in place.
    * Presumably used to exchange the red and blue channels -- the channel
    * assignment depends on the pixel format of the caller.
    */
   752 static __inline__ __m64
   753 invert_colors (__m64 pixel)
   754 {
   755     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
   756 }
   757 
   758 /* Notes about writing mmx code
   759  *
   760  * give memory operands as the second operand. If you give it as the
   761  * first, gcc will first load it into a register, then use that
   762  * register
   763  *
   764  *   ie. use
   765  *
   766  *         _mm_mullo_pi16 (x, mmx_constant);
   767  *
   768  *   not
   769  *
   770  *         _mm_mullo_pi16 (mmx_constant, x);
   771  *
   772  * Also try to minimize dependencies. i.e. when you need a value, try
   773  * to calculate it from a value that was calculated as early as
   774  * possible.
   775  */
   776 
   /*
    * Composite src over dest: returns src + dest * (255 - srca) / 255 per
    * lane, using negate() and pix_multiply(), with an unsigned saturating
    * byte add.  srca is the source alpha already broadcast to every lane.
    */
   777 static __inline__ __m64
   778 over (__m64 src, __m64 srca, __m64 dest)
   779 {
   780     return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
   781 }
   782 
   /*
    * OVER for a source that still needs multiplying by its own alpha and has
    * lanes 0 and 2 swapped relative to dest.
    *
    * srca is the broadcast alpha; srcfaaa is the same with the top lane
    * forced to 0xff, so multiplying by it scales lanes 0..2 by alpha while
    * leaving the alpha lane itself unchanged.  The swapped, scaled source is
    * then composited over dest with over().
    */
   783 static __inline__ __m64
   784 over_rev_non_pre (__m64 src, __m64 dest)
   785 {
   786     __m64 srca = expand_alpha (src);
   787     __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
   788     
   789     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
   790 }
   791 
   /* IN operator: lane-wise pix_multiply() of src by mask. */
   792 static __inline__ __m64
   793 in (__m64 src,
   794     __m64 mask)
   795 {
   796     return pix_multiply (src, mask);
   797 }
   798 
   /*
    * Masked OVER: multiplies both src and its broadcast alpha srca by mask
    * (lane-wise), then composites the masked source over dest with over().
    */
   799 static __inline__ __m64
   800 in_over (__m64 src,
   801 	 __m64 srca,
   802 	 __m64 mask,
   803 	 __m64 dest)
   804 {
   805     return over(in(src, mask), pix_multiply(srca, mask), dest);
   806 }
   807 
   /*
    * Unpack a 32-bit pixel into four 16-bit lanes, one byte per lane,
    * zero-extended (byte 0 of v lands in lane 0, byte 3 in lane 3).
    */
   808 static __inline__ __m64
   809 load8888 (CARD32 v)
   810 {
   811     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
   812 }
   813 
   /*
    * Pack two unpacked pixels (four 16-bit lanes each) back into bytes with
    * unsigned saturation: lo fills the low 32 bits of the result and hi the
    * high 32 bits.
    */
   814 static __inline__ __m64
   815 pack8888 (__m64 lo, __m64 hi)
   816 {
   817     __m64 r;
   818     r = _mm_packs_pu16 (lo, hi);
   819     return r;
   820 }
   821 
   /*
    * Inverse of load8888(): packs one unpacked pixel to bytes (saturating)
    * and returns it as a 32-bit value.
    */
   822 static __inline__ CARD32
   823 store8888 (__m64 v)
   824 {
   825     return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
   826 }
   827 
   828 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
   829  *
   830  *    00RR00GG00BB
   831  * 
   832  * --- Expanding 565 in the low word ---
   833  * 
   834  * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
   835  * m = m & (01f0003f001f);
   836  * m = m * (008404100840);
   837  * m = m >> 8;
   838  * 
   839  * Note the trick here - the top word is shifted by another nibble to
   840  * avoid it bumping into the middle word
   841  */
   842 static __inline__ __m64
   843 expand565 (__m64 pixel, int pos)
   844 {
   845     __m64 p = pixel;
   846     __m64 t1, t2;
   847     
   848     /* move pixel to low 16 bit and zero the rest */
   849     p = shift (shift (p, (3 - pos) * 16), -48); 
   850     
           /* t1 puts the top (red) field at bit 36, t2 the middle (green)
            * field at bit 16; the low (blue) field stays where it is. */
   851     t1 = shift (p, 36 - 11);
   852     t2 = shift (p, 16 - 5);
   853     
   854     p = _mm_or_si64 (t1, p);
   855     p = _mm_or_si64 (t2, p);
           /* Keep only one field per 16-bit lane. */
   856     p = _mm_and_si64 (p, MC(565_rgb));
   857     
           /* Per-lane multiply followed by >> 8 widens each field to 8 bits. */
   858     pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
   859     return _mm_srli_pi16 (pixel, 8);
   860 }
   861 
   /*
    * Unpack one of the two 32-bit pixels held in `in` into four
    * zero-extended 16-bit lanes: the low pixel when pos == 0, the high pixel
    * for any other pos.
    */
   862 static __inline__ __m64
   863 expand8888 (__m64 in, int pos)
   864 {
   865     if (pos == 0)
   866 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
   867     else
   868 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
   869 }
   870 
   /*
    * Convert an unpacked pixel (as produced by expand565()/load8888()) to
    * 5-6-5 and insert it into 16-bit field `pos` (0-3) of target.
    *
    * The top 5/6/5 bits of lanes 2/1/0 are masked out, shifted down to bit
    * positions 11-15 / 5-10 / 0-4 and then moved up by pos * 16.  The
    * matching field of target is cleared before the three parts are OR-ed
    * in; for pos outside 0-3 no field of target is cleared.
    */
   871 static __inline__ __m64
   872 pack565 (__m64 pixel, __m64 target, int pos)
   873 {
   874     __m64 p = pixel;
   875     __m64 t = target;
   876     __m64 r, g, b;
   877     
   878     r = _mm_and_si64 (p, MC(565_r));
   879     g = _mm_and_si64 (p, MC(565_g));
   880     b = _mm_and_si64 (p, MC(565_b));
   881     
           /* Negative counts shift right, positive counts left (see shift()). */
   882     r = shift (r, - (32 - 8) + pos * 16);
   883     g = shift (g, - (16 - 3) + pos * 16);
   884     b = shift (b, - (0  + 3) + pos * 16);
   885     
   886     if (pos == 0)
   887 	t = _mm_and_si64 (t, MC(mask_0));
   888     else if (pos == 1)
   889 	t = _mm_and_si64 (t, MC(mask_1));
   890     else if (pos == 2)
   891 	t = _mm_and_si64 (t, MC(mask_2));
   892     else if (pos == 3)
   893 	t = _mm_and_si64 (t, MC(mask_3));
   894     
   895     p = _mm_or_si64 (r, t);
   896     p = _mm_or_si64 (g, p);
   897     
   898     return _mm_or_si64 (b, p);
   899 }
   900 
   901 #ifdef ENABLE_BROKEN_IMPLS
   902 /* broken.  See Debian bug #340932 */
   /*
    * Composite the single pixel *src over w destination pixels at dst.
    * Only built when ENABLE_BROKEN_IMPLS is defined.
    *
    * The source pixel and its broadcast alpha are computed once.  Pixels
    * are handled one at a time until dst is 8-byte aligned, then two per
    * 64-bit load/store, then one at a time for whatever remains.
    *
    * Registered as an implementation of composite_over_argb_const_src with
    * the MMX and MMXEXT implementation flags.
    */
   903 static void
   904 fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
   905 {
   906     __m64	vsrc, vsrca;
   907 
   908     vsrc = load8888 (*src);
   909     vsrca = expand_alpha (vsrc);
   910 
           /* Head: single pixels until dst reaches an 8-byte boundary. */
   911     while (w && (unsigned long)dst & 7)
   912     {
   913         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
   914         
   915         w--;
   916         dst++;
   917     }
   918     
           /* Body: two pixels per iteration through one 64-bit access. */
   919     while (w >= 2)
   920     {
   921         __m64 vdest;
   922         __m64 dest0, dest1;
   923         
   924         vdest = *(__m64 *)dst;
   925         
   926         dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
   927         dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
   928         
   929         *(__m64 *)dst = pack8888(dest0, dest1);
   930         
   931         dst += 2;
   932         w -= 2;
   933     }
   934     
           /* Tail: at most one pixel left after the paired loop. */
   935     while (w)
   936     {
   937         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
   938         
   939         w--;
   940         dst++;
   941     }
   942     
           /* Leave MMX state so later x87 floating-point code works. */
   943     _mm_empty();
   944 }
   945 OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
   946     OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
   947 #endif
   948 
   949 #if 0
   /*
    * fbCompositeSolid_nx0565mmx (compiled out by the surrounding #if 0).
    *
    * Composites a solid source colour over a width x height area of 16-bit
    * destination pixels.  The colour is obtained with fbComposeGetSolid and
    * the destination start/stride with fbComposeGetStart (both defined
    * outside this file); the function returns early when the colour's top
    * byte is zero.  Each row is processed one pixel at a time until dst is
    * 8-byte aligned, then four pixels per 64-bit access, then one at a time
    * for the remainder, using expand565()/over()/pack565().
    *
    * op, pMask, xSrc, ySrc, xMask and yMask are not used in the body.
    *
    * NOTE(review): this file typedefs PicturePtr as CARD32 *, so the
    * pDst->format access below cannot compile as written.
    */
   950 void
   951 fbCompositeSolid_nx0565mmx (CARD8	op,
   952 			    PicturePtr pSrc,
   953 			    PicturePtr pMask,
   954 			    PicturePtr pDst,
   955 			    INT16	xSrc,
   956 			    INT16	ySrc,
   957 			    INT16	xMask,
   958 			    INT16	yMask,
   959 			    INT16	xDst,
   960 			    INT16	yDst,
   961 			    CARD16	width,
   962 			    CARD16	height)
   963 {
   964     CARD32	src;
   965     CARD16	*dstLine, *dst;
   966     CARD16	w;
   967     FbStride	dstStride;
   968     __m64	vsrc, vsrca;
   969     
   970     CHECKPOINT();
   971     
   972     fbComposeGetSolid(pSrc, src, pDst->format);
   973     
   974     if (src >> 24 == 0)
   975 	return;
   976     
   977     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
   978     
   979     vsrc = load8888 (src);
   980     vsrca = expand_alpha (vsrc);
   981     
   982     while (height--)
   983     {
   984 	dst = dstLine;
   985 	dstLine += dstStride;
   986 	w = width;
   987 	
   988 	CHECKPOINT();
   989 	
   	/* Head: single pixels until dst reaches an 8-byte boundary. */
   990 	while (w && (unsigned long)dst & 7)
   991 	{
   992 	    ullong d = *dst;
   993 	    __m64 vdest = expand565 ((__m64)d, 0);
   994 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
   995 	    *dst = (ullong)vdest;
   996 	    
   997 	    w--;
   998 	    dst++;
   999 	}
  1000 	
   	/* Body: four 16-bit pixels per 64-bit load/store. */
  1001 	while (w >= 4)
  1002 	{
  1003 	    __m64 vdest;
  1004 	    
  1005 	    vdest = *(__m64 *)dst;
  1006 	    
  1007 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
  1008 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
  1009 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
  1010 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
  1011 	    
  1012 	    *(__m64 *)dst = vdest;
  1013 	    
  1014 	    dst += 4;
  1015 	    w -= 4;
  1016 	}
  1017 	
  1018 	CHECKPOINT();
  1019 	
   	/* Tail: remaining pixels of the row, one at a time. */
  1020 	while (w)
  1021 	{
  1022 	    ullong d = *dst;
  1023 	    __m64 vdest = expand565 ((__m64)d, 0);
  1024 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
  1025 	    *dst = (ullong)vdest;
  1026 	    
  1027 	    w--;
  1028 	    dst++;
  1029 	}
  1030     }
  1031     
  1032     _mm_empty();
  1033 }
  1034 #endif
  1035 
  1036 #if 0
       /*
        * fbCompositeSolidMask_nx8888x8888Cmmx -- compiled out by the #if 0 above.
        *
        * Per-pixel loop that, for every non-zero 32-bit mask value m read through
        * p, replaces the pixel at q with in_over (vsrc, vsrca, load8888 (m), *q).
        * Pixels whose mask value is zero are left untouched.
        *
        * NOTE(review): this looks like an unfinished conversion to the
        * (dst, src, mask, w) calling convention and cannot compile as written:
        *   - twidth, p and q are used below but never declared in this function;
        *   - `src` is both a parameter and a local CARD32;
        *   - vsrc and vsrca are read without ever being assigned;
        *   - srca, dstLine, maskLine, dstStride and maskStride are declared but
        *     never used.
        * Resolve these before removing the #if 0.
        */
  1037 static void
  1038 fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
  1039 {
  1040     CARD32	src, srca;
  1041     CARD32	*dstLine;
  1042     CARD32	*maskLine;
  1043     FbStride	dstStride, maskStride;
  1044     __m64	vsrc, vsrca;
  1045     
  1046     
       /* Single pixels until q is 8-byte aligned. */
  1047     while (twidth && (unsigned long)q & 7)
  1048     {
  1049         CARD32 m = *(CARD32 *)p;
  1050         
  1051         if (m)
  1052         {
  1053             __m64 vdest = load8888(*q);
  1054             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
  1055             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
  1056         }
  1057         
  1058         twidth--;
  1059         p++;
  1060         q++;
  1061     }
  1062     
       /* Two pixels per 64-bit access; the pair is skipped when both masks are 0. */
  1063     while (twidth >= 2)
  1064     {
  1065         CARD32 m0, m1;
  1066         m0 = *p;
  1067         m1 = *(p + 1);
  1068         
  1069         if (m0 | m1)
  1070         {
  1071             __m64 dest0, dest1;
  1072             __m64 vdest = *(__m64 *)q;
  1073             
  1074             dest0 = in_over(vsrc, vsrca, load8888(m0),
  1075                             expand8888 (vdest, 0));
  1076             dest1 = in_over(vsrc, vsrca, load8888(m1),
  1077                             expand8888 (vdest, 1));
  1078             
  1079             *(__m64 *)q = pack8888(dest0, dest1);
  1080         }
  1081         
  1082         p += 2;
  1083         q += 2;
  1084         twidth -= 2;
  1085     }
  1086     
       /* Trailing pixel(s), one at a time. */
  1087     while (twidth)
  1088     {
  1089         CARD32 m = *(CARD32 *)p;
  1090         
  1091         if (m)
  1092         {
  1093             __m64 vdest = load8888(*q);
  1094             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
  1095             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
  1096         }
  1097         
  1098         twidth--;
  1099         p++;
  1100         q++;
  1101     }
  1102     
  1103     _mm_empty();
  1104 }
  1105 #endif
  1106 
  1107 #if 0
       /*
        * fbCompositeSrc_8888x8x8888mmx -- compiled out by the #if 0 above.
        *
        * Combines each source pixel with the destination through
        * in_over (s, srca, vmask, d), where vmask is built from the first mask
        * byte replicated into all four byte positions and srca is the constant
        * MC(4x00ff).
        *
        * NOTE(review): this looks like an unfinished conversion to the
        * (dest, src, mask, width) calling convention and cannot compile as
        * written: maskLine, vmask, srca, height, dst, dstLine, dstStride,
        * srcLine, srcStride and w are used but never declared here, `dest` is
        * never used, and the uint8_t* parameter `mask` is assigned an integer.
        * Resolve these before removing the #if 0.
        */
  1108 static void
  1109 fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
  1110     int width)
  1111 {
  1112 
  1113     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
  1114     vmask = load8888 (mask);
  1115     srca = MC(4x00ff);
  1116     
  1117     while (height--)
  1118     {
  1119 	dst = dstLine;
  1120 	dstLine += dstStride;
  1121 	src = srcLine;
  1122 	srcLine += srcStride;
  1123 	w = width;
  1124 
       	/* Single pixels until dst is 8-byte aligned. */
  1125 	while (w && (unsigned long)dst & 7)
  1126 	{
  1127 	    __m64 s = load8888 (*src);
  1128 	    __m64 d = load8888 (*dst);
  1129 	    
  1130 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
  1131 	    
  1132 	    w--;
  1133 	    dst++;
  1134 	    src++;
  1135 	}
  1136 
       	/* Unrolled: 16 pixels per iteration, as eight 64-bit pairs (vd0..vd7). */
  1137 	while (w >= 16)
  1138 	{
  1139 	    __m64 vd0 = *(__m64 *)(dst + 0);
  1140 	    __m64 vd1 = *(__m64 *)(dst + 2);
  1141 	    __m64 vd2 = *(__m64 *)(dst + 4);
  1142 	    __m64 vd3 = *(__m64 *)(dst + 6);
  1143 	    __m64 vd4 = *(__m64 *)(dst + 8);
  1144 	    __m64 vd5 = *(__m64 *)(dst + 10);
  1145 	    __m64 vd6 = *(__m64 *)(dst + 12);
  1146 	    __m64 vd7 = *(__m64 *)(dst + 14);
  1147 
  1148 	    __m64 vs0 = *(__m64 *)(src + 0);
  1149 	    __m64 vs1 = *(__m64 *)(src + 2);
  1150 	    __m64 vs2 = *(__m64 *)(src + 4);
  1151 	    __m64 vs3 = *(__m64 *)(src + 6);
  1152 	    __m64 vs4 = *(__m64 *)(src + 8);
  1153 	    __m64 vs5 = *(__m64 *)(src + 10);
  1154 	    __m64 vs6 = *(__m64 *)(src + 12);
  1155 	    __m64 vs7 = *(__m64 *)(src + 14);
  1156 
  1157 	    vd0 = (__m64)pack8888 (
  1158 		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
  1159 		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
  1160 	
  1161 	    vd1 = (__m64)pack8888 (
  1162 		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
  1163 		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
  1164 	
  1165 	    vd2 = (__m64)pack8888 (
  1166 		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
  1167 		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
  1168 	
  1169 	    vd3 = (__m64)pack8888 (
  1170 		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
  1171 		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
  1172 	
  1173 	    vd4 = (__m64)pack8888 (
  1174 		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
  1175 		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
  1176 	
  1177 	    vd5 = (__m64)pack8888 (
  1178 		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
  1179 		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
  1180 	
  1181 	    vd6 = (__m64)pack8888 (
  1182 		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
  1183 		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
  1184 	
  1185 	    vd7 = (__m64)pack8888 (
  1186 		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
  1187 		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
  1188 
  1189     	    *(__m64 *)(dst + 0) = vd0;
  1190 	    *(__m64 *)(dst + 2) = vd1;
  1191 	    *(__m64 *)(dst + 4) = vd2;
  1192 	    *(__m64 *)(dst + 6) = vd3;
  1193 	    *(__m64 *)(dst + 8) = vd4;
  1194 	    *(__m64 *)(dst + 10) = vd5;
  1195 	    *(__m64 *)(dst + 12) = vd6;
  1196 	    *(__m64 *)(dst + 14) = vd7;
  1197 	
  1198 	    w -= 16;
  1199 	    dst += 16;
  1200 	    src += 16;
  1201 	}
  1202 	
       	/* Remaining 0..15 pixels, one at a time. */
  1203 	while (w)
  1204 	{
  1205 	    __m64 s = load8888 (*src);
  1206 	    __m64 d = load8888 (*dst);
  1207 	    
  1208 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
  1209 	    
  1210 	    w--;
  1211 	    dst++;
  1212 	    src++;
  1213 	}
  1214     }
  1215 
  1216     _mm_empty(); 
  1217 }
  1218 
       /*
        * fbCompositeSrc_8888x8888mmx -- inside a disabled (#if 0) region.
        *
        * For every pixel of a width x height rectangle computes
        * over (s, expand_alpha (s), d) from a CARD32 source pixel s and a CARD32
        * destination pixel d, and stores the result back to the destination.
        *
        * op, pMask, xMask and yMask are not read by this body.
        * NOTE(review): `srca` is assigned MC (4x00ff) but never read afterwards.
        */
  1219 void
  1220 fbCompositeSrc_8888x8888mmx (CARD8	op,
  1221 			     PicturePtr pSrc,
  1222 			     PicturePtr pMask,
  1223 			     PicturePtr pDst,
  1224 			     INT16	xSrc,
  1225 			     INT16	ySrc,
  1226 			     INT16      xMask,
  1227 			     INT16      yMask,
  1228 			     INT16      xDst,
  1229 			     INT16      yDst,
  1230 			     CARD16     width,
  1231 			     CARD16     height)
  1232 {
  1233     CARD32	*dstLine, *dst;
  1234     CARD32	*srcLine, *src;
  1235     FbStride	dstStride, srcStride;
  1236     CARD16	w;
  1237     __m64  srca;
  1238     
  1239     CHECKPOINT();
  1240     
  1241     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1242     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1243 
  1244     srca = MC (4x00ff);
  1245     
  1246     while (height--)
  1247     {
  1248 	dst = dstLine;
  1249 	dstLine += dstStride;
  1250 	src = srcLine;
  1251 	srcLine += srcStride;
  1252 	w = width;
  1253 
       	/* Single pixels until dst is 8-byte aligned. */
  1254 	while (w && (unsigned long)dst & 7)
  1255 	{
  1256 	    __m64 s = load8888 (*src);
  1257 	    __m64 d = load8888 (*dst);
  1258 	    
  1259 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
  1260 	    
  1261 	    w--;
  1262 	    dst++;
  1263 	    src++;
  1264 	}
  1265 
       	/* Two pixels per 64-bit access; only dst was aligned above, src is
       	 * loaded through the same __m64 cast regardless of its alignment. */
  1266 	while (w >= 2)
  1267 	{
  1268 	    __m64 vd = *(__m64 *)(dst + 0);
  1269 	    __m64 vs = *(__m64 *)(src + 0);
  1270 	    __m64 vs0 = expand8888 (vs, 0);
  1271 	    __m64 vs1 = expand8888 (vs, 1);
  1272 
  1273 	    *(__m64 *)dst = (__m64)pack8888 (
  1274 		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
  1275 		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
  1276 	    
  1277 	    w -= 2;
  1278 	    dst += 2;
  1279 	    src += 2;
  1280 	}
  1281 	
       	/* At most one pixel is left here. */
  1282 	while (w)
  1283 	{
  1284 	    __m64 s = load8888 (*src);
  1285 	    __m64 d = load8888 (*dst);
  1286 	    
  1287 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
  1288 				     (__m64)_mm_setzero_si64());
  1289 	    
  1290 	    w--;
  1291 	    dst++;
  1292 	    src++;
  1293 	}
  1294     }
  1295 
  1296     _mm_empty(); 
  1297 }
  1298 
       /*
        * fbCompositeSolidMask_nx8x8888mmx -- inside a disabled (#if 0) region.
        *
        * Applies a solid colour (from fbComposeGetSolid) to CARD32 destination
        * pixels through a CARD8 mask: each pixel with a non-zero mask byte m
        * becomes in_over (vsrc, vsrca, expand_alpha_rev (m), dst).  Returns at
        * once when the colour's top byte (src >> 24) is zero.
        *
        * op, xSrc, ySrc are not read by this body.
        */
  1299 void
  1300 fbCompositeSolidMask_nx8x8888mmx (CARD8      op,
  1301 				  PicturePtr pSrc,
  1302 				  PicturePtr pMask,
  1303 				  PicturePtr pDst,
  1304 				  INT16      xSrc,
  1305 				  INT16      ySrc,
  1306 				  INT16      xMask,
  1307 				  INT16      yMask,
  1308 				  INT16      xDst,
  1309 				  INT16      yDst,
  1310 				  CARD16     width,
  1311 				  CARD16     height)
  1312 {
  1313     CARD32	src, srca;
  1314     CARD32	*dstLine, *dst;
  1315     CARD8	*maskLine, *mask;
  1316     FbStride	dstStride, maskStride;
  1317     CARD16	w;
  1318     __m64	vsrc, vsrca;
  1319     ullong	srcsrc;
  1320     
  1321     CHECKPOINT();
  1322     
  1323     fbComposeGetSolid(pSrc, src, pDst->format);
  1324     
  1325     srca = src >> 24;
  1326     if (srca == 0)
  1327 	return;
  1328     
       /* The 32-bit colour duplicated into both halves of a 64-bit word, used
        * by the direct-store fast path below. */
  1329     srcsrc = (unsigned long long)src << 32 | src;
  1330     
  1331     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1332     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
  1333     
  1334     vsrc = load8888 (src);
  1335     vsrca = expand_alpha (vsrc);
  1336     
  1337     while (height--)
  1338     {
  1339 	dst = dstLine;
  1340 	dstLine += dstStride;
  1341 	mask = maskLine;
  1342 	maskLine += maskStride;
  1343 	w = width;
  1344 	
  1345 	CHECKPOINT();
  1346 	
       	/* Single pixels until dst is 8-byte aligned. */
  1347 	while (w && (unsigned long)dst & 7)
  1348 	{
  1349 	    ullong m = *mask;
  1350 	    
  1351 	    if (m)
  1352 	    {
  1353 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
  1354 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
  1355 	    }
  1356 	    
  1357 	    w--;
  1358 	    mask++;
  1359 	    dst++;
  1360 	}
  1361 	
  1362 	CHECKPOINT();
  1363 	
       	/* Two pixels per iteration. */
  1364 	while (w >= 2)
  1365 	{
  1366 	    ullong m0, m1;
  1367 	    m0 = *mask;
  1368 	    m1 = *(mask + 1);
  1369 	    
       	    /* Colour top byte 0xff and both mask bytes 0xff: store the colour
       	     * pair directly without reading dst. */
  1370 	    if (srca == 0xff && (m0 & m1) == 0xff)
  1371 	    {
  1372 		*(unsigned long long *)dst = srcsrc;
  1373 	    }
       	    /* Otherwise blend unless both mask bytes are zero (then skip). */
  1374 	    else if (m0 | m1)
  1375 	    {
  1376 		__m64 vdest;
  1377 		__m64 dest0, dest1;
  1378 		
  1379 		vdest = *(__m64 *)dst;
  1380 		
  1381 		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
  1382 		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
  1383 		
  1384 		*(__m64 *)dst = pack8888(dest0, dest1);
  1385 	    }
  1386 	    
  1387 	    mask += 2;
  1388 	    dst += 2;
  1389 	    w -= 2;
  1390 	}
  1391 	
  1392 	CHECKPOINT();
  1393 	
       	/* At most one pixel is left here. */
  1394 	while (w)
  1395 	{
  1396 	    ullong m = *mask;
  1397 	    
  1398 	    if (m)
  1399 	    {
  1400 		__m64 vdest = load8888(*dst);
  1401 		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
  1402 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
  1403 	    }
  1404 	    
  1405 	    w--;
  1406 	    mask++;
  1407 	    dst++;
  1408 	}
  1409     }
  1410     
  1411     _mm_empty();
  1412 }
  1413 
  1414 
       /*
        * fbCompositeSolidMask_nx8x0565mmx -- inside a disabled (#if 0) region.
        *
        * Applies a solid colour (from fbComposeGetSolid) to CARD16 destination
        * pixels through a CARD8 mask.  Each pixel with a non-zero mask byte m
        * becomes pack565 (in_over (vsrc, vsrca, expand_alpha_rev (m),
        * expand565 (dst))).  Returns at once when the colour's top byte
        * (src >> 24) is zero.
        *
        * op, xSrc, ySrc are not read by this body.
        */
  1415 void
  1416 fbCompositeSolidMask_nx8x0565mmx (CARD8      op,
  1417 				  PicturePtr pSrc,
  1418 				  PicturePtr pMask,
  1419 				  PicturePtr pDst,
  1420 				  INT16      xSrc,
  1421 				  INT16      ySrc,
  1422 				  INT16      xMask,
  1423 				  INT16      yMask,
  1424 				  INT16      xDst,
  1425 				  INT16      yDst,
  1426 				  CARD16     width,
  1427 				  CARD16     height)
  1428 {
  1429     CARD32	src, srca;
  1430     CARD16	*dstLine, *dst;
  1431     CARD8	*maskLine, *mask;
  1432     FbStride	dstStride, maskStride;
  1433     CARD16	w;
  1434     __m64	vsrc, vsrca;
  1435     unsigned long long srcsrcsrcsrc, src16;
  1436     
  1437     CHECKPOINT();
  1438     
  1439     fbComposeGetSolid(pSrc, src, pDst->format);
  1440     
  1441     srca = src >> 24;
  1442     if (srca == 0)
  1443 	return;
  1444     
  1445     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1446     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
  1447     
  1448     vsrc = load8888 (src);
  1449     vsrca = expand_alpha (vsrc);
  1450     
       /* src16 is the colour packed by pack565 into lane 0; srcsrcsrcsrc repeats
        * it in all four 16-bit positions for the direct-store fast path. */
  1451     src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
  1452     
  1453     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
  1454 	(ullong)src16 << 16 | (ullong)src16;
  1455     
  1456     while (height--)
  1457     {
  1458 	dst = dstLine;
  1459 	dstLine += dstStride;
  1460 	mask = maskLine;
  1461 	maskLine += maskStride;
  1462 	w = width;
  1463 	
  1464 	CHECKPOINT();
  1465 	
       	/* Single pixels until dst is 8-byte aligned. */
  1466 	while (w && (unsigned long)dst & 7)
  1467 	{
  1468 	    ullong m = *mask;
  1469 	    
  1470 	    if (m)
  1471 	    {
  1472 		ullong d = *dst;
  1473 		__m64 vd = (__m64)d;
  1474 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
  1475 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
  1476 	    }
  1477 	    
  1478 	    w--;
  1479 	    mask++;
  1480 	    dst++;
  1481 	}
  1482 	
  1483 	CHECKPOINT();
  1484 	
       	/* Four pixels per iteration. */
  1485 	while (w >= 4)
  1486 	{
  1487 	    ullong m0, m1, m2, m3;
  1488 	    m0 = *mask;
  1489 	    m1 = *(mask + 1);
  1490 	    m2 = *(mask + 2);
  1491 	    m3 = *(mask + 3);
  1492 	    
       	    /* Colour top byte 0xff and all four mask bytes 0xff: store the
       	     * replicated colour directly without reading dst. */
  1493 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
  1494 	    {
  1495 		*(unsigned long long *)dst = srcsrcsrcsrc;
  1496 	    }
       	    /* Otherwise blend unless all four mask bytes are zero (then skip). */
  1497 	    else if (m0 | m1 | m2 | m3)
  1498 	    {
  1499 		__m64 vdest;
  1500 		__m64 vm0, vm1, vm2, vm3;
  1501 		
  1502 		vdest = *(__m64 *)dst;
  1503 		
  1504 		vm0 = (__m64)m0;
  1505 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
  1506 		vm1 = (__m64)m1;
  1507 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
  1508 		vm2 = (__m64)m2;
  1509 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
  1510 		vm3 = (__m64)m3;
  1511 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
  1512 		
  1513 		*(__m64 *)dst = vdest;
  1514 	    }
  1515 	    
  1516 	    w -= 4;
  1517 	    mask += 4;
  1518 	    dst += 4;
  1519 	}
  1520 	
  1521 	CHECKPOINT();
  1522 	
       	/* Remaining 0..3 pixels, one at a time. */
  1523 	while (w)
  1524 	{
  1525 	    ullong m = *mask;
  1526 	    
  1527 	    if (m)
  1528 	    {
  1529 		ullong d = *dst;
  1530 		__m64 vd = (__m64)d;
  1531 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
  1532 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
  1533 	    }
  1534 	    
  1535 	    w--;
  1536 	    mask++;
  1537 	    dst++;
  1538 	}
  1539     }
  1540     
  1541     _mm_empty();
  1542 }
  1543 
       /*
        * fbCompositeSrc_8888RevNPx0565mmx -- inside a disabled (#if 0) region.
        *
        * Combines CARD32 source pixels with CARD16 destination pixels using
        * over_rev_non_pre() and stores the result through pack565().
        * Asserts that pSrc and pMask refer to the same drawable.
        *
        * op, xMask and yMask are not read by this body.
        */
  1544 void
  1545 fbCompositeSrc_8888RevNPx0565mmx (CARD8      op,
  1546 				  PicturePtr pSrc,
  1547 				  PicturePtr pMask,
  1548 				  PicturePtr pDst,
  1549 				  INT16      xSrc,
  1550 				  INT16      ySrc,
  1551 				  INT16      xMask,
  1552 				  INT16      yMask,
  1553 				  INT16      xDst,
  1554 				  INT16      yDst,
  1555 				  CARD16     width,
  1556 				  CARD16     height)
  1557 {
  1558     CARD16	*dstLine, *dst;
  1559     CARD32	*srcLine, *src;
  1560     FbStride	dstStride, srcStride;
  1561     CARD16	w;
  1562     
  1563     CHECKPOINT();
  1564     
  1565     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1566     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1567     
  1568     assert (pSrc->pDrawable == pMask->pDrawable);
  1569     
  1570     while (height--)
  1571     {
  1572 	dst = dstLine;
  1573 	dstLine += dstStride;
  1574 	src = srcLine;
  1575 	srcLine += srcStride;
  1576 	w = width;
  1577 	
  1578 	CHECKPOINT();
  1579 	
       	/* Single pixels until dst is 8-byte aligned. */
  1580 	while (w && (unsigned long)dst & 7)
  1581 	{
  1582 	    __m64 vsrc = load8888 (*src);
  1583 	    ullong d = *dst;
  1584 	    __m64 vdest = expand565 ((__m64)d, 0);
  1585 	    
  1586 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
  1587 	    
  1588 	    *dst = (ullong)vdest;
  1589 	    
  1590 	    w--;
  1591 	    dst++;
  1592 	    src++;
  1593 	}
  1594 	
  1595 	CHECKPOINT();
  1596 	
       	/* Four pixels per iteration; a0..a3 are the top bytes of s0..s3. */
  1597 	while (w >= 4)
  1598 	{
  1599 	    CARD32 s0, s1, s2, s3;
  1600 	    unsigned char a0, a1, a2, a3;
  1601 	    
  1602 	    s0 = *src;
  1603 	    s1 = *(src + 1);
  1604 	    s2 = *(src + 2);
  1605 	    s3 = *(src + 3);
  1606 	    
  1607 	    a0 = (s0 >> 24);
  1608 	    a1 = (s1 >> 24);
  1609 	    a2 = (s2 >> 24);
  1610 	    a3 = (s3 >> 24);
  1611 	    
       	    /* All four top bytes are 0xFF: dst is overwritten with
       	     * invert_colors() of each source pixel, without being read. */
  1612 	    if ((a0 & a1 & a2 & a3) == 0xFF)
  1613 	    {
  1614 		__m64 vdest;
  1615 		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
  1616 		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
  1617 		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
  1618 		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
  1619 		
  1620 		*(__m64 *)dst = vdest;
  1621 	    }
       	    /* At least one top byte is non-zero: blend all four lanes.  When
       	     * all four are zero the destination is left untouched. */
  1622 	    else if (a0 | a1 | a2 | a3)
  1623 	    {
  1624 		__m64 vdest = *(__m64 *)dst;
  1625 		
  1626 		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
  1627 	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
  1628 		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
  1629 		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
  1630 		
  1631 		*(__m64 *)dst = vdest;
  1632 	    }
  1633 	    
  1634 	    w -= 4;
  1635 	    dst += 4;
  1636 	    src += 4;
  1637 	}
  1638 	
  1639 	CHECKPOINT();
  1640 	
       	/* Remaining 0..3 pixels, one at a time. */
  1641 	while (w)
  1642 	{
  1643 	    __m64 vsrc = load8888 (*src);
  1644 	    ullong d = *dst;
  1645 	    __m64 vdest = expand565 ((__m64)d, 0);
  1646 	    
  1647 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
  1648 	    
  1649 	    *dst = (ullong)vdest;
  1650 	    
  1651 	    w--;
  1652 	    dst++;
  1653 	    src++;
  1654 	}
  1655     }
  1656     
  1657     _mm_empty();
  1658 }
  1659 
  1660 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
  1661 
       /*
        * fbCompositeSrc_8888RevNPx8888mmx -- inside a disabled (#if 0) region.
        *
        * Combines CARD32 source pixels with CARD32 destination pixels using
        * over_rev_non_pre() and stores the result through pack8888().
        * Asserts that pSrc and pMask refer to the same drawable.
        *
        * op, xMask and yMask are not read by this body.
        */
  1662 void
  1663 fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
  1664 				  PicturePtr pSrc,
  1665 				  PicturePtr pMask,
  1666 				  PicturePtr pDst,
  1667 				  INT16      xSrc,
  1668 				  INT16      ySrc,
  1669 				  INT16      xMask,
  1670 				  INT16      yMask,
  1671 				  INT16      xDst,
  1672 				  INT16      yDst,
  1673 				  CARD16     width,
  1674 				  CARD16     height)
  1675 {
  1676     CARD32	*dstLine, *dst;
  1677     CARD32	*srcLine, *src;
  1678     FbStride	dstStride, srcStride;
  1679     CARD16	w;
  1680     
  1681     CHECKPOINT();
  1682     
  1683     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1684     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1685     
  1686     assert (pSrc->pDrawable == pMask->pDrawable);
  1687     
  1688     while (height--)
  1689     {
  1690 	dst = dstLine;
  1691 	dstLine += dstStride;
  1692 	src = srcLine;
  1693 	srcLine += srcStride;
  1694 	w = width;
  1695 	
       	/* Single pixels until dst is 8-byte aligned. */
  1696 	while (w && (unsigned long)dst & 7)
  1697 	{
  1698 	    __m64 s = load8888 (*src);
  1699 	    __m64 d = load8888 (*dst);
  1700 	    
  1701 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
  1702 	    
  1703 	    w--;
  1704 	    dst++;
  1705 	    src++;
  1706 	}
  1707 	
       	/* Two pixels per iteration; a0/a1 are bits 24..31 of s0/s1. */
  1708 	while (w >= 2)
  1709 	{
  1710 	    ullong s0, s1;
  1711 	    unsigned char a0, a1;
  1712 	    __m64 d0, d1;
  1713 	    
  1714 	    s0 = *src;
  1715 	    s1 = *(src + 1);
  1716 	    
  1717 	    a0 = (s0 >> 24);
  1718 	    a1 = (s1 >> 24);
  1719 	    
       	    /* Both top bytes are 0xFF: dst is overwritten with invert_colors()
       	     * of each source pixel, without being read. */
  1720 	    if ((a0 & a1) == 0xFF)
  1721 	    {
  1722 		d0 = invert_colors(load8888(s0));
  1723 		d1 = invert_colors(load8888(s1));
  1724 		
  1725 		*(__m64 *)dst = pack8888 (d0, d1);
  1726 	    }
       	    /* At least one is non-zero: blend both.  Both zero: leave dst as is. */
  1727 	    else if (a0 | a1)
  1728 	    {
  1729 		__m64 vdest = *(__m64 *)dst;
  1730 		
  1731 		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
  1732 		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
  1733 		
  1734 		*(__m64 *)dst = pack8888 (d0, d1);
  1735 	    }
  1736 	    
  1737 	    w -= 2;
  1738 	    dst += 2;
  1739 	    src += 2;
  1740 	}
  1741 	
       	/* At most one pixel is left here. */
  1742 	while (w)
  1743 	{
  1744 	    __m64 s = load8888 (*src);
  1745 	    __m64 d = load8888 (*dst);
  1746 	    
  1747 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
  1748 	    
  1749 	    w--;
  1750 	    dst++;
  1751 	    src++;
  1752 	}
  1753     }
  1754     
  1755     _mm_empty();
  1756 }
  1757 
       /*
        * fbCompositeSolidMask_nx8888x0565Cmmx -- inside a disabled (#if 0) region.
        *
        * Applies a solid colour (from fbComposeGetSolid) to CARD16 destination
        * pixels through a CARD32 mask.  Each pixel with a non-zero mask word m
        * becomes pack565 (in_over (vsrc, vsrca, load8888 (m), expand565 (dst))).
        * Returns at once when the colour's top byte (src >> 24) is zero.
        *
        * op, xSrc, ySrc are not read by this body.
        */
  1758 void
  1759 fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
  1760 				      PicturePtr pSrc,
  1761 				      PicturePtr pMask,
  1762 				      PicturePtr pDst,
  1763 				      INT16      xSrc,
  1764 				      INT16      ySrc,
  1765 				      INT16      xMask,
  1766 				      INT16      yMask,
  1767 				      INT16      xDst,
  1768 				      INT16      yDst,
  1769 				      CARD16     width,
  1770 				      CARD16     height)
  1771 {
  1772     CARD32	src, srca;
  1773     CARD16	*dstLine;
  1774     CARD32	*maskLine;
  1775     FbStride	dstStride, maskStride;
  1776     __m64  vsrc, vsrca;
  1777     
  1778     CHECKPOINT();
  1779     
  1780     fbComposeGetSolid(pSrc, src, pDst->format);
  1781     
  1782     srca = src >> 24;
  1783     if (srca == 0)
  1784 	return;
  1785     
  1786     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1787     fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
  1788     
  1789     vsrc = load8888 (src);
  1790     vsrca = expand_alpha (vsrc);
  1791     
       /* One iteration per scanline: p walks the mask row, q the destination row. */
  1792     while (height--)
  1793     {
  1794 	int twidth = width;
  1795 	CARD32 *p = (CARD32 *)maskLine;
  1796 	CARD16 *q = (CARD16 *)dstLine;
  1797 	
       	/* Single pixels until q is 8-byte aligned. */
  1798 	while (twidth && ((unsigned long)q & 7))
  1799 	{
  1800 	    CARD32 m = *(CARD32 *)p;
  1801 	    
  1802 	    if (m)
  1803 	    {
  1804 		ullong d = *q;
  1805 		__m64 vdest = expand565 ((__m64)d, 0);
  1806 		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
  1807 		*q = (ullong)vdest;
  1808 	    }
  1809 	    
  1810 	    twidth--;
  1811 	    p++;
  1812 	    q++;
  1813 	}
  1814 	
       	/* Four pixels per iteration; skipped when all four mask words are 0. */
  1815 	while (twidth >= 4)
  1816 	{
  1817 	    CARD32 m0, m1, m2, m3;
  1818 	    
  1819 	    m0 = *p;
  1820 	    m1 = *(p + 1);
  1821 	    m2 = *(p + 2);
  1822 	    m3 = *(p + 3);
  1823 	    
  1824 	    if ((m0 | m1 | m2 | m3))
  1825 	    {
  1826 		__m64 vdest = *(__m64 *)q;
  1827 		
  1828 		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
  1829 		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
  1830 		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
  1831 		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
  1832 		
  1833 		*(__m64 *)q = vdest;
  1834 	    }
  1835 	    twidth -= 4;
  1836 	    p += 4;
  1837 	    q += 4;
  1838 	}
  1839 	
       	/* Remaining 0..3 pixels, one at a time. */
  1840 	while (twidth)
  1841 	{
  1842 	    CARD32 m;
  1843 	    
  1844 	    m = *(CARD32 *)p;
  1845 	    if (m)
  1846 	    {
  1847 		ullong d = *q;
  1848 		__m64 vdest = expand565((__m64)d, 0);
  1849 		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
  1850 		*q = (ullong)vdest;
  1851 	    }
  1852 	    
  1853 	    twidth--;
  1854 	    p++;
  1855 	    q++;
  1856 	}
  1857 	
  1858 	maskLine += maskStride;
  1859 	dstLine += dstStride;
  1860     }
  1861     
  1862     _mm_empty ();
  1863 }
  1864 #endif
  1865 
  /*
   * Saturating add of two byte buffers:
   *     dst[i] = min (dst[i] + src[i], 255)   for i in [0, w).
   *
   *   dst  destination bytes, updated in place
   *   src  source bytes that are added to dst (only read)
   *   w    number of bytes to process; zero or a negative value is a no-op
   *
   * Bytes are handled one at a time until dst is 8-byte aligned, then eight
   * at a time with _mm_adds_pu8, then one at a time for what is left.
   * _mm_empty () is always issued before returning to leave MMX state.
   */
  static void
  fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
  {
      int sum;

      /*
       * The byte loops test w > 0 rather than w != 0: with a negative count
       * the old test never became false and the loops walked off the end of
       * both buffers.
       */
      while (w > 0 && ((unsigned long)dst & 7))
      {
          sum = *dst + *src;
          *dst = (uint8_t)(sum > 255 ? 255 : sum);

          dst++;
          src++;
          w--;
      }

      /* dst is 8-byte aligned here; src is loaded whatever its alignment. */
      while (w >= 8)
      {
          *(__m64 *)dst = _mm_adds_pu8 (*(__m64 *)src, *(__m64 *)dst);
          dst += 8;
          src += 8;
          w -= 8;
      }

      /* Fewer than eight bytes remain. */
      while (w > 0)
      {
          sum = *dst + *src;
          *dst = (uint8_t)(sum > 255 ? 255 : sum);

          dst++;
          src++;
          w--;
      }

      _mm_empty ();
  }
  1909 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX); /* registers the function above as a composite_add_u8 implementation, flagged as needing MMX */
  1910 
  /*
   * Per-byte saturating add of two buffers of 32-bit pixels: every byte of
   * dst[i] becomes min (dst byte + matching src byte, 255) for i in [0, w).
   *
   *   dst  destination pixels, updated in place
   *   src  source pixels that are added to dst (only read)
   *   w    number of 32-bit pixels; zero or a negative value is a no-op
   *
   * One pixel is handled on its own while dst is not 8-byte aligned, then
   * two pixels per _mm_adds_pu8, then a possible last single pixel.
   * _mm_empty () is always issued before returning to leave MMX state.
   */
  static void
  fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
  {
      /*
       * w > 0 rather than w != 0: a negative count used to get through this
       * test (and the final one below) and write pixels out of bounds.
       */
      while (w > 0 && ((unsigned long)dst & 7))
      {
          *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                 _mm_cvtsi32_si64 (*dst)));
          dst++;
          src++;
          w--;
      }

      /* dst is 8-byte aligned here; src is loaded whatever its alignment. */
      while (w >= 2)
      {
          *(__m64 *)dst = _mm_adds_pu8 (*(__m64 *)src, *(__m64 *)dst);
          dst += 2;
          src += 2;
          w -= 2;
      }

      /* At most one pixel can be left. */
      if (w > 0)
      {
          *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
                                                 _mm_cvtsi32_si64 (*dst)));
      }

      _mm_empty ();
  }
  1940 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); /* registers the function above as a composite_add_argb implementation; NOTE(review): the body appears to use only MMX intrinsics -- confirm the SSE flag is needed */
  1941 
  1942 #if 0
       /*
        * GetStart(drw, x, y, type, stride, line, bpp) -- compiled out by the
        * #if 0 above.
        *
        * Calls fbGetDrawable() on drw, then sets `stride` to the drawable's
        * stride converted from FbBits units to units of `type`, and `line` to
        * the address of the `type` element at (x, y) after subtracting the
        * offsets returned by fbGetDrawable().
        *
        * The expansion is a bare { ... } block and the arguments stride and
        * line are assigned to, so they must be lvalues.
        */
  1943 #define GetStart(drw,x,y,type,stride,line,bpp) {\
  1944     FbBits	*__bits__;									\
  1945     FbStride	__stride__;									\
  1946     int		__xoff__,__yoff__;								\
  1947 												\
  1948     fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__);				\
  1949     (stride) = __stride__ * sizeof (FbBits) / sizeof (type);					\
  1950     (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__);		\
  1951 }
  1952 
       /*
        * fbSolidFillmmx -- inside a disabled (#if 0) region.
        *
        * Fills a width x height rectangle at (x, y) of pDraw with the value xor.
        *
        * Returns FALSE without writing anything when the drawable is not 16 or
        * 32 bits per pixel, or when it is 16 bpp and the upper and lower 16 bits
        * of xor differ.  Returns TRUE after filling.
        */
  1953 Bool
  1954 fbSolidFillmmx (DrawablePtr	pDraw,
  1955 		int		x,
  1956 		int		y,
  1957 		int		width,
  1958 		int		height,
  1959 		FbBits		xor)
  1960 { 
  1961     FbStride	stride;
  1962     int		bpp;
  1963     ullong	fill;
  1964     __m64	vfill;
  1965     CARD32	byte_width;
  1966     CARD8	*byte_line;
  1967     FbBits      *bits;
  1968     int		xoff, yoff;
  1969     
  1970     CHECKPOINT();
  1971     
  1972     fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
  1973     
  1974     if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
  1975 	return FALSE;
  1976     
  1977     if (bpp != 16 && bpp != 32)
  1978 	return FALSE;
  1979     
       /* Locate the first byte of the rectangle; afterwards byte_width and
        * stride are both expressed in bytes. */
  1980     if (bpp == 16)
  1981     {
  1982 	stride = stride * sizeof (FbBits) / 2;
  1983 	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
  1984 	byte_width = 2 * width;
  1985 	stride *= 2;
  1986     }
  1987     else
  1988     {
  1989 	stride = stride * sizeof (FbBits) / 4;
  1990 	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
  1991 	byte_width = 4 * width;
  1992 	stride *= 4;
  1993     }
  1994     
       /* xor placed in both 32-bit halves of the 64-bit fill pattern. */
  1995     fill = ((ullong)xor << 32) | xor;
  1996     vfill = (__m64)fill;
  1997     
  1998     while (height--)
  1999     {
  2000 	int w;
  2001 	CARD8 *d = byte_line;
  2002 	byte_line += stride;
  2003 	w = byte_width;
  2004 	
       	/* 16-bit stores until d is 4-byte aligned. */
  2005 	while (w >= 2 && ((unsigned long)d & 3))
  2006 	{
  2007 	    *(CARD16 *)d = xor;
  2008 	    w -= 2;
  2009 	    d += 2;
  2010 	}
  2011 	
       	/* 32-bit stores until d is 8-byte aligned. */
  2012 	while (w >= 4 && ((unsigned long)d & 7))
  2013 	{
  2014 	    *(CARD32 *)d = xor;
  2015 	    
  2016 	    w -= 4;
  2017 	    d += 4;
  2018 	}
  2019 	
       	/* 64 bytes per iteration as eight 64-bit stores. */
  2020 	while (w >= 64)
  2021 	{
  2022 	    *(__m64*) (d +  0) = vfill;
  2023 	    *(__m64*) (d +  8) = vfill;
  2024 	    *(__m64*) (d + 16) = vfill;
  2025 	    *(__m64*) (d + 24) = vfill;
  2026 	    *(__m64*) (d + 32) = vfill;
  2027 	    *(__m64*) (d + 40) = vfill;
  2028 	    *(__m64*) (d + 48) = vfill;
  2029 	    *(__m64*) (d + 56) = vfill;
  2030 	    
  2031 	    w -= 64;
  2032 	    d += 64;
  2033 	}
       	/* Remainder: 32-bit stores, then at most one 16-bit store. */
  2034 	while (w >= 4)
  2035 	{
  2036 	    *(CARD32 *)d = xor;
  2037 	    
  2038 	    w -= 4;
  2039 	    d += 4;
  2040 	}
  2041 	if (w >= 2)
  2042 	{
  2043 	    *(CARD16 *)d = xor;
  2044 	    w -= 2;
  2045 	    d += 2;
  2046 	}
  2047     }
  2048     
  2049     _mm_empty();
  2050     return TRUE;
  2051 }
  2052 
       /*
        * fbCopyAreammx -- inside a disabled (#if 0) region.
        *
        * Copies a width x height rectangle from (src_x, src_y) in pSrc to
        * (dst_x, dst_y) in pDst, row by row in increasing address order.
        *
        * Returns FALSE without copying unless both drawables have the same
        * depth and that depth is 16 or 32 bits per pixel; returns TRUE after
        * copying.
        *
        * NOTE(review): rows are always copied front to back, so overlapping
        * source and destination areas are presumably the caller's problem --
        * confirm against the callers.
        */
  2053 Bool
  2054 fbCopyAreammx (DrawablePtr	pSrc,
  2055 	       DrawablePtr	pDst,
  2056 	       int		src_x,
  2057 	       int		src_y,
  2058 	       int		dst_x,
  2059 	       int		dst_y,
  2060 	       int		width,
  2061 	       int		height)
  2062 {
  2063     FbBits *	src_bits;
  2064     FbStride	src_stride;
  2065     int		src_bpp;
  2066     int		src_xoff;
  2067     int		src_yoff;
  2068 
  2069     FbBits *	dst_bits;
  2070     FbStride	dst_stride;
  2071     int		dst_bpp;
  2072     int		dst_xoff;
  2073     int		dst_yoff;
  2074 
  2075     CARD8 *	src_bytes;
  2076     CARD8 *	dst_bytes;
  2077     int		byte_width;
  2078     
  2079     fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
  2080     fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
  2081 
  2082     if (src_bpp != 16 && src_bpp != 32)
  2083 	return FALSE;
  2084 
  2085     if (dst_bpp != 16 && dst_bpp != 32)
  2086 	return FALSE;
  2087 
  2088     if (src_bpp != dst_bpp)
  2089     {
  2090 	return FALSE;
  2091     }
  2092     
       /* Locate the first byte of both rectangles; afterwards byte_width and
        * both strides are expressed in bytes. */
  2093     if (src_bpp == 16)
  2094     {
  2095 	src_stride = src_stride * sizeof (FbBits) / 2;
  2096 	dst_stride = dst_stride * sizeof (FbBits) / 2;
  2097 	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
  2098 	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
  2099 	byte_width = 2 * width;
  2100 	src_stride *= 2;
  2101 	dst_stride *= 2;
  2102     }
  2103     else
  2104     {
  2105 	src_stride = src_stride * sizeof (FbBits) / 4;
  2106 	dst_stride = dst_stride * sizeof (FbBits) / 4;
  2107 	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
  2108 	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
  2109 	byte_width = 4 * width;
  2110 	src_stride *= 4;
  2111 	dst_stride *= 4;
  2112     }
  2113 
  2114     while (height--)
  2115     {
  2116 	int w;
  2117 	CARD8 *s = src_bytes;
  2118 	CARD8 *d = dst_bytes;
  2119 	src_bytes += src_stride;
  2120 	dst_bytes += dst_stride;
  2121 	w = byte_width;
  2122 	
       	/* 16-bit copies until d is 4-byte aligned.  Only d's alignment is
       	 * tested in this function; s is read through the same casts whatever
       	 * its alignment. */
  2123 	while (w >= 2 && ((unsigned long)d & 3))
  2124 	{
  2125 	    *(CARD16 *)d = *(CARD16 *)s;
  2126 	    w -= 2;
  2127 	    s += 2;
  2128 	    d += 2;
  2129 	}
  2130 	
       	/* 32-bit copies until d is 8-byte aligned. */
  2131 	while (w >= 4 && ((unsigned long)d & 7))
  2132 	{
  2133 	    *(CARD32 *)d = *(CARD32 *)s;
  2134 	    
  2135 	    w -= 4;
  2136 	    s += 4;
  2137 	    d += 4;
  2138 	}
  2139 	
       	/* 64 bytes per iteration as eight 64-bit copies. */
  2140 	while (w >= 64)
  2141 	{
  2142 	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
  2143 	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
  2144 	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
  2145 	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
  2146 	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
  2147 	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
  2148 	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
  2149 	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
  2150 	    w -= 64;
  2151 	    s += 64;
  2152 	    d += 64;
  2153 	}
       	/* Remainder: 32-bit copies, then at most one 16-bit copy. */
  2154 	while (w >= 4)
  2155 	{
  2156 	    *(CARD32 *)d = *(CARD32 *)s;
  2157 
  2158 	    w -= 4;
  2159 	    s += 4;
  2160 	    d += 4;
  2161 	}
  2162 	if (w >= 2)
  2163 	{
  2164 	    *(CARD16 *)d = *(CARD16 *)s;
  2165 	    w -= 2;
  2166 	    s += 2;
  2167 	    d += 2;
  2168 	}
  2169     }
  2170     
  2171     _mm_empty();
  2172     return TRUE;
  2173 }
  2174 
  2175 void
  2176 fbCompositeCopyAreammx (CARD8		op,
  2177 			PicturePtr	pSrc,
  2178 			PicturePtr	pMask,
  2179 			PicturePtr	pDst,
  2180 			INT16		xSrc,
  2181 			INT16		ySrc,
  2182 			INT16		xMask,
  2183 			INT16		yMask,
  2184 			INT16		xDst,
  2185 			INT16		yDst,
  2186 			CARD16		width,
  2187 			CARD16		height)
  2188 {
  2189     fbCopyAreammx (pSrc->pDrawable,
  2190 		   pDst->pDrawable,
  2191 		   xSrc, ySrc,
  2192 		   xDst, yDst,
  2193 		   width, height);
  2194 }
  2195 
  2196 #if !defined(__amd64__) && !defined(__x86_64__)
  2197 
  2198 enum CPUFeatures {
  2199     NoFeatures = 0,
  2200     MMX = 0x1,
  2201     MMX_Extensions = 0x2, 
  2202     SSE = 0x6,
  2203     SSE2 = 0x8,
  2204     CMOV = 0x10
  2205 };
  2206 
  2207 static unsigned int detectCPUFeatures(void) {
  2208     unsigned int result;
  2209     char vendor[13];
  2210     vendor[0] = 0;
  2211     vendor[12] = 0;
  2212     /* see p. 118 of amd64 instruction set manual Vol3 */
  2213     __asm__ ("push %%ebx\n"
  2214              "pushf\n"
  2215              "pop %%eax\n"
  2216              "mov %%eax, %%ebx\n"
  2217              "xor $0x00200000, %%eax\n"
  2218              "push %%eax\n"
  2219              "popf\n"
  2220              "pushf\n"
  2221              "pop %%eax\n"
  2222              "mov $0x0, %%edx\n"
  2223              "xor %%ebx, %%eax\n"
  2224              "jz skip\n"
  2225 
  2226              "mov $0x00000000, %%eax\n"
  2227              "cpuid\n"
  2228              "mov %%ebx, %1\n"
  2229              "mov %%edx, %2\n"
  2230              "mov %%ecx, %3\n"
  2231              "mov $0x00000001, %%eax\n"
  2232              "cpuid\n"
  2233              "skip:\n"
  2234              "pop %%ebx\n"
  2235              "mov %%edx, %0\n"
  2236              : "=r" (result), 
  2237                "=m" (vendor[0]), 
  2238                "=m" (vendor[4]), 
  2239                "=m" (vendor[8])
  2240              :
  2241              : "%eax", "%ecx", "%edx"
  2242         );
  2243 
  2244     unsigned int features = 0;
  2245     if (result) {
  2246         /* result now contains the standard feature bits */
  2247         if (result & (1 << 15))
  2248             features |= CMOV;
  2249         if (result & (1 << 23))
  2250             features |= MMX;
  2251         if (result & (1 << 25))
  2252             features |= SSE;
  2253         if (result & (1 << 26))
  2254             features |= SSE2;
  2255         if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
  2256             /* check for AMD MMX extensions */
  2257 
  2258             unsigned int result;            
  2259             __asm__("push %%ebx\n"
  2260                     "mov $0x80000000, %%eax\n"
  2261                     "cpuid\n"
  2262                     "xor %%edx, %%edx\n"
  2263                     "cmp $0x1, %%eax\n"
  2264                     "jge skip2\n"
  2265                     "mov $0x80000001, %%eax\n"
  2266                     "cpuid\n"
  2267                     "skip2:\n"
  2268                     "mov %%edx, %0\n"
  2269                     "pop %%ebx\n"
  2270                     : "=r" (result)
  2271                     :
  2272                     : "%eax", "%ecx", "%edx"
  2273                 );
  2274             if (result & (1<<22))
  2275                 features |= MMX_Extensions;
  2276         }
  2277     }
  2278     return features;
  2279 }
  2280 
  2281 Bool
  2282 fbHaveMMX (void)
  2283 {
  2284     static Bool initialized = FALSE;
  2285     static Bool mmx_present;
  2286     
  2287     if (!initialized)
  2288     {
  2289         unsigned int features = detectCPUFeatures();
  2290 	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
  2291         initialized = TRUE;
  2292     }
  2293     
  2294     return mmx_present;
  2295 }
  2296 #endif /* __amd64__ */
  2297 
  2298 
  2299 #endif
  2300 
  2301 
  2302 #ifdef	__SYMBIAN32__
  2303  
  2304 OilFunctionImpl* __oil_function_impl_mmxCombineOverU, composite_over_argb() {
  2305 		return &_oil_function_impl_mmxCombineOverU, composite_over_argb;
  2306 }
  2307 #endif
  2308 
  2309 #ifdef	__SYMBIAN32__
  2310  
  2311 OilFunctionImpl* __oil_function_impl_mmxCombineAddU, composite_add_argb() {
  2312 		return &_oil_function_impl_mmxCombineAddU, composite_add_argb;
  2313 }
  2314 #endif
  2315 
  2316 #ifdef	__SYMBIAN32__
  2317  
  2318 OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src() {
  2319 		return &_oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src;
  2320 }
  2321 #endif
  2322 
  2323 #ifdef	__SYMBIAN32__
  2324  
  2325 OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8() {
  2326 		return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8;
  2327 }
  2328 #endif
  2329 
  2330 #ifdef	__SYMBIAN32__
  2331  
  2332 OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb() {
  2333 		return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb;
  2334 }
  2335 #endif
  2336