os/ossrv/genericopenlibs/liboil/src/fb/fbmmx.c
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/fb/fbmmx.c	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,2336 @@
     1.4 +/*
     1.5 + * Copyright © 2004 Red Hat, Inc.
     1.6 + * Copyright © 2004 Nicholas Miell
     1.7 + * Copyright © 2005 Trolltech AS
     1.8 + *
     1.9 + * Permission to use, copy, modify, distribute, and sell this software and its
    1.10 + * documentation for any purpose is hereby granted without fee, provided that
    1.11 + * the above copyright notice appear in all copies and that both that
    1.12 + * copyright notice and this permission notice appear in supporting
    1.13 + * documentation, and that the name of Red Hat not be used in advertising or
    1.14 + * publicity pertaining to distribution of the software without specific,
    1.15 + * written prior permission.  Red Hat makes no representations about the
    1.16 + * suitability of this software for any purpose.  It is provided "as is"
    1.17 + * without express or implied warranty.
    1.18 + *
    1.19 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    1.20 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.21 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    1.22 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    1.23 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    1.24 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    1.25 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    1.26 + * SOFTWARE.
    1.27 + *
    1.28 + * Author:  Søren Sandmann (sandmann@redhat.com)
    1.29 + * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
    1.30 + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) 
    1.31 + *
    1.32 + * Based on work by Owen Taylor
    1.33 + */
    1.34 +//Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
    1.35 +
    1.36 +#ifdef HAVE_CONFIG_H
    1.37 +#include "config.h"
    1.38 +#endif
    1.39 +
    1.40 +#include <liboil/liboil.h>
    1.41 +#include <liboil/liboilfunction.h>
    1.42 + 
    1.43 +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
    1.44 +
    1.45 +typedef uint32_t CARD32; /* unsigned 32-bit value; load8888/store8888 below use it for one packed pixel */
    1.46 +typedef uint16_t CARD16; /* unsigned 16-bit value */
    1.47 +typedef int16_t INT16; /* signed 16-bit value */
    1.48 +typedef uint8_t CARD8; /* unsigned 8-bit value */
    1.49 +typedef uint64_t ullong; /* unsigned 64-bit value */
    1.50 +typedef CARD32* PicturePtr; /* NOTE(review): plain CARD32 pointer here; presumably a stand-in so the included headers compile -- confirm */
    1.51 +typedef CARD32* FbBits; /* NOTE(review): pointer typedef; confirm the intended meaning against fbpict.h */
    1.52 +typedef int FbStride; /* plain int */
    1.53 +
    1.54 +
    1.55 +#include "fbmmx.h"
    1.56 +#include "fbpict.h"
    1.57 +
    1.58 +#define CHECKPOINT()
    1.59 +
    1.60 +OIL_DECLARE_CLASS (composite_in_argb);
    1.61 +OIL_DECLARE_CLASS (composite_in_argb_const_src);
    1.62 +OIL_DECLARE_CLASS (composite_in_argb_const_mask);
    1.63 +OIL_DECLARE_CLASS (composite_over_argb);
    1.64 +OIL_DECLARE_CLASS (composite_over_argb_const_src);
    1.65 +OIL_DECLARE_CLASS (composite_add_argb);
    1.66 +OIL_DECLARE_CLASS (composite_add_argb_const_src);
    1.67 +OIL_DECLARE_CLASS (composite_in_over_argb);
    1.68 +OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
    1.69 +OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
    1.70 +OIL_DECLARE_CLASS (composite_over_u8);
    1.71 +OIL_DECLARE_CLASS (composite_add_u8);
    1.72 +
    1.73 +
    1.74 +/* --------------- MMX code paths for fbcompose.c --------------------- */
    1.75 +
    1.76 +#if 0
    1.77 +static void
    1.78 +mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
    1.79 +{ /* Disabled (#if 0). For each of `width` elements: dest[i] = MmxFrom(src[i] combined with MmxAlpha(mask[i]) by MmxMul). */
    1.80 +    const __m64 mmx_0 = _mm_setzero_si64();
    1.81 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; /* NOTE(review): not named in the body; presumably referenced by the Mmx* macros defined outside this view -- confirm */
    1.82 +    
    1.83 +    const uint8_t *end = mask + width; /* same element type as `mask`, so the loop comparison below is between compatible pointers */
    1.84 +    while (mask < end) {
    1.85 +        __m64 a = MmxTo(*mask);
    1.86 +        __m64 s = MmxTo(*src);
    1.87 +        a = MmxAlpha(a); /* NOTE(review): *mask is a single byte here; confirm MmxAlpha picks the lane that byte was loaded into before enabling */
    1.88 +        MmxMul(s, a); /* NOTE(review): presumably updates s in place with s*a -- confirm against the macro */
    1.89 +        *dest = MmxFrom(s);
    1.90 +        ++src;
    1.91 +        ++dest;
    1.92 +        ++mask;
    1.93 +    }
    1.94 +    _mm_empty(); /* clear MMX state before returning */
    1.95 +}
    1.96 +#endif
    1.97 +
    1.98 +#ifdef ENABLE_BROKEN_IMPLS
    1.99 +static void
   1.100 +mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
   1.101 +{ /* Built only under ENABLE_BROKEN_IMPLS. Rewrites each of `width` pixels in dest from the matching src pixel and the old dest pixel. */
   1.102 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.103 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.104 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; /* NOTE(review): these three locals are not named in the body; presumably the Mmx* macros (defined outside this view) refer to them -- confirm */
   1.105 +
   1.106 +    const uint32_t *end = dest + width;
   1.107 +
   1.108 +    while (dest < end) {
   1.109 +        __m64 x, y, a;
   1.110 +        x = MmxTo(*src);
   1.111 +        y = MmxTo(*dest);
   1.112 +        a = MmxAlpha(x);
   1.113 +        a = MmxNegate(a);
   1.114 +        MmxMulAdd(y, a, x); /* NOTE(review): presumably y = y*a + x, i.e. dest*(1 - src alpha) + src -- confirm against the macro */
   1.115 +        *dest = MmxFrom(y);
   1.116 +        ++dest;
   1.117 +        ++src;
   1.118 +    }
   1.119 +    _mm_empty(); /* clear MMX state before returning */
   1.120 +}
   1.121 +OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX); /* registers this routine as an implementation of composite_over_argb */
   1.122 +#endif
   1.123 +
   1.124 +#if 0
   1.125 +static FASTCALL void
   1.126 +mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
   1.127 +{ /* Disabled (#if 0). Same loop as mmxCombineOverU with the roles of *dest and *src swapped when loading x and y. */
   1.128 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.129 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.130 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.131 +
   1.132 +    const CARD32 *end = dest + width;
   1.133 +
   1.134 +    while (dest < end) {
   1.135 +        __m64 x, y, a;
   1.136 +        x = MmxTo(*dest);
   1.137 +        y = MmxTo(*src);
   1.138 +        a = MmxAlpha(x); /* alpha is taken from the destination pixel here */
   1.139 +        a = MmxNegate(a);
   1.140 +        MmxMulAdd(y, a, x); /* NOTE(review): presumably y = y*a + x, i.e. src*(1 - dest alpha) + dest -- confirm against the macro */
   1.141 +        *dest = MmxFrom(y);
   1.142 +        ++dest;
   1.143 +        ++src;
   1.144 +    }
   1.145 +    _mm_empty(); /* clear MMX state before returning */
   1.146 +}
   1.147 +#endif
   1.148 +
   1.149 +#if 0
   1.150 +static void
   1.151 +mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
   1.152 +{ /* Disabled (#if 0). Per pixel: stores the source pixel combined (MmxMul) with the destination pixel's alpha. */
   1.153 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.154 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.155 +
   1.156 +    const CARD32 *end = dest + width;
   1.157 +
   1.158 +    while (dest < end) {
   1.159 +        __m64 x, a;
   1.160 +        x = MmxTo(*src);
   1.161 +        a = MmxTo(*dest);
   1.162 +        a = MmxAlpha(a);
   1.163 +        MmxMul(x, a); /* NOTE(review): presumably x = x*a (src scaled by dest alpha) -- confirm against the macro */
   1.164 +        *dest = MmxFrom(x);
   1.165 +        ++dest;
   1.166 +        ++src;
   1.167 +    }
   1.168 +    _mm_empty(); /* clear MMX state before returning */
   1.169 +}
   1.170 +#endif
   1.171 +
   1.172 +#if 0
   1.173 +static FASTCALL void
   1.174 +mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
   1.175 +{ /* Disabled (#if 0). Per pixel: stores the destination pixel combined (MmxMul) with the source pixel's alpha. */
   1.176 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.177 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.178 +
   1.179 +    const CARD32 *end = dest + width;
   1.180 +
   1.181 +    while (dest < end) {
   1.182 +        __m64 x, a;
   1.183 +        x = MmxTo(*dest);
   1.184 +        a = MmxTo(*src);
   1.185 +        a = MmxAlpha(a);
   1.186 +        MmxMul(x, a); /* NOTE(review): presumably x = x*a (dest scaled by src alpha) -- confirm against the macro */
   1.187 +        *dest = MmxFrom(x);
   1.188 +        ++dest;
   1.189 +        ++src;
   1.190 +    }
   1.191 +    _mm_empty(); /* clear MMX state before returning */
   1.192 +}
   1.193 +#endif
   1.194 +
   1.195 +#if 0
   1.196 +static FASTCALL void
   1.197 +mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
   1.198 +{ /* Disabled (#if 0). Per pixel: stores the source pixel combined (MmxMul) with the negated destination alpha. */
   1.199 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.200 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.201 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.202 +
   1.203 +    const CARD32 *end = dest + width;
   1.204 +
   1.205 +    while (dest < end) {
   1.206 +        __m64 x, a;
   1.207 +        x = MmxTo(*src);
   1.208 +        a = MmxTo(*dest);
   1.209 +        a = MmxAlpha(a);
   1.210 +        a = MmxNegate(a);
   1.211 +        MmxMul(x, a); /* NOTE(review): presumably x = x*a (src scaled by 1 - dest alpha) -- confirm against the macro */
   1.212 +        *dest = MmxFrom(x);
   1.213 +        ++dest;
   1.214 +        ++src;
   1.215 +    }
   1.216 +    _mm_empty(); /* clear MMX state before returning */
   1.217 +}
   1.218 +#endif
   1.219 +
   1.220 +#if 0
   1.221 +static FASTCALL void
   1.222 +mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
   1.223 +{ /* Disabled (#if 0). Per pixel: stores the destination pixel combined (MmxMul) with the negated source alpha. */
   1.224 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.225 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.226 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.227 +
   1.228 +    const CARD32 *end = dest + width;
   1.229 +
   1.230 +    while (dest < end) {
   1.231 +        __m64 x, a;
   1.232 +        x = MmxTo(*dest);
   1.233 +        a = MmxTo(*src);
   1.234 +        a = MmxAlpha(a);
   1.235 +        a = MmxNegate(a);
   1.236 +        MmxMul(x, a); /* NOTE(review): presumably x = x*a (dest scaled by 1 - src alpha) -- confirm against the macro */
   1.237 +        *dest = MmxFrom(x);
   1.238 +        ++dest;
   1.239 +        ++src;
   1.240 +    }
   1.241 +    _mm_empty(); /* clear MMX state before returning */
   1.242 +}
   1.243 +
   1.244 +static FASTCALL void
   1.245 +mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
   1.246 +{ /* Disabled (#if 0). Per pixel: combines src weighted by dest alpha with dest weighted by negated src alpha, and stores the result in dest. */
   1.247 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.248 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.249 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.250 +
   1.251 +    const CARD32 *end = dest + width;
   1.252 +
   1.253 +    while (dest < end) {
   1.254 +        __m64 s, da, d, sia;
   1.255 +        s = MmxTo(*src);
   1.256 +        d = MmxTo(*dest);
   1.257 +        sia = MmxAlpha(s);
   1.258 +        sia = MmxNegate(sia); /* "sia": negated source alpha */
   1.259 +        da = MmxAlpha(d);
   1.260 +        MmxAddMul(s, da, d, sia); /* NOTE(review): presumably s = s*da + d*sia -- confirm against the macro */
   1.261 +        *dest = MmxFrom(s);
   1.262 +        ++dest;
   1.263 +        ++src;
   1.264 +    }
   1.265 +    _mm_empty(); /* clear MMX state before returning */
   1.266 +}
   1.267 +
   1.268 +static FASTCALL void
   1.269 +mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
   1.270 +{ /* Disabled (#if 0). Per pixel: combines src weighted by negated dest alpha with dest weighted by src alpha, and stores the result in dest. */
   1.271 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.272 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.273 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.274 +
   1.275 +    const CARD32 *end;
   1.276 +
   1.277 +    end = dest + width;
   1.278 +
   1.279 +    while (dest < end) {
   1.280 +        __m64 s, dia, d, sa;
   1.281 +        s = MmxTo(*src);
   1.282 +        d = MmxTo(*dest);
   1.283 +        sa = MmxAlpha(s);
   1.284 +        dia = MmxAlpha(d);
   1.285 +        dia = MmxNegate(dia); /* "dia": negated destination alpha */
   1.286 +        MmxAddMul(s, dia, d, sa); /* NOTE(review): presumably s = s*dia + d*sa -- confirm against the macro */
   1.287 +        *dest = MmxFrom(s);
   1.288 +        ++dest;
   1.289 +        ++src;
   1.290 +    }
   1.291 +    _mm_empty(); /* clear MMX state before returning */
   1.292 +}
   1.293 +
   1.294 +static FASTCALL void
   1.295 +mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
   1.296 +{ /* Disabled (#if 0). Per pixel: combines src weighted by negated dest alpha with dest weighted by negated src alpha, and stores the result in dest. */
   1.297 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.298 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.299 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.300 +
   1.301 +    const CARD32 *end = dest + width;
   1.302 +
   1.303 +    while (dest < end) {
   1.304 +        __m64 s, dia, d, sia;
   1.305 +        s = MmxTo(*src);
   1.306 +        d = MmxTo(*dest);
   1.307 +        sia = MmxAlpha(s);
   1.308 +        dia = MmxAlpha(d);
   1.309 +        sia = MmxNegate(sia);
   1.310 +        dia = MmxNegate(dia);
   1.311 +        MmxAddMul(s, dia, d, sia); /* NOTE(review): presumably s = s*dia + d*sia -- confirm against the macro */
   1.312 +        *dest = MmxFrom(s);
   1.313 +        ++dest;
   1.314 +        ++src;
   1.315 +    }
   1.316 +    _mm_empty(); /* clear MMX state before returning */
   1.317 +}
   1.318 +#endif
   1.319 +
   1.320 +static void
   1.321 +mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
   1.322 +{ /* For each of `width` pixels: dest[i] = MmxFrom(MmxAdd(src[i], dest[i])), with both operands loaded through MmxTo. */
   1.323 +    const __m64 mmx_0 = _mm_setzero_si64(); /* NOTE(review): not named in the body; presumably referenced by MmxTo/MmxFrom, which are defined outside this view -- confirm */
   1.324 +
   1.325 +    const uint32_t *end = dest + width;
   1.326 +    while (dest < end) {
   1.327 +        __m64 s, d;
   1.328 +        s = MmxTo(*src);
   1.329 +        d = MmxTo(*dest);
   1.330 +        s = MmxAdd(s, d); /* MmxAdd yields the sum as a value; it is assigned back to s */
   1.331 +        *dest = MmxFrom(s);
   1.332 +        ++dest;
   1.333 +        ++src;
   1.334 +    }
   1.335 +    _mm_empty(); /* clear MMX state before returning */
   1.336 +}
   1.337 +OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); /* registers this routine as an implementation of composite_add_argb */
   1.338 +
   1.339 +#if 0
   1.340 +static FASTCALL void
   1.341 +mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
   1.342 +{ /* Disabled (#if 0). Per pixel: adds src to dest, first scaling src down when its alpha exceeds the remaining (inverted) dest alpha. */
   1.343 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.344 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.345 +
   1.346 +    const CARD32 *end = dest + width;
   1.347 +    while (dest < end) {
   1.348 +        CARD32 s = *src;
   1.349 +        CARD32 d = *dest;
   1.350 +        __m64 ms = MmxTo(s);
   1.351 +        __m64 md = MmxTo(d);
   1.352 +        CARD32 sa = s >> 24; /* top byte of the source pixel */
   1.353 +        CARD32 da = ~d >> 24; /* top byte of the inverted destination pixel */
   1.354 +
   1.355 +        if (sa > da) {
   1.356 +            __m64 msa = MmxTo(FbIntDiv(da, sa)); /* NOTE(review): FbIntDiv is defined outside this view; presumably a scaled da/sa ratio -- confirm */
   1.357 +            msa = MmxAlpha(msa);
   1.358 +            MmxMul(ms, msa);
   1.359 +        }
   1.360 +        md = MmxAdd(md, ms); /* keep the sum: MmxAdd is used as a value-returning expression elsewhere in this file */
   1.361 +        *dest = MmxFrom(md);
   1.362 +        ++src;
   1.363 +        ++dest;
   1.364 +    }
   1.365 +    _mm_empty(); /* clear MMX state before returning */
   1.366 +}
   1.367 +
   1.368 +
   1.369 +static FASTCALL void
   1.370 +mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.371 +{ /* Disabled (#if 0). Per pixel: stores the source pixel combined (MmxMul) with the 32-bit mask pixel. */
   1.372 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.373 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.374 +
   1.375 +    const CARD32 *end = src + width;
   1.376 +    while (src < end) {
   1.377 +        __m64 a = MmxTo(*mask);
   1.378 +        __m64 s = MmxTo(*src);
   1.379 +        MmxMul(s, a); /* NOTE(review): presumably s = s*a per channel -- confirm against the macro */
   1.380 +        *dest = MmxFrom(s);
   1.381 +        ++src;
   1.382 +        ++mask;
   1.383 +        ++dest;
   1.384 +    }
   1.385 +    _mm_empty(); /* clear MMX state before returning */
   1.386 +}
   1.387 +
   1.388 +static FASTCALL void
   1.389 +mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.390 +{ /* Disabled (#if 0). Masked variant of the "over" loop: both the source and the source alpha are combined with the mask pixel first. */
   1.391 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.392 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.393 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.394 +    
   1.395 +    const CARD32 *end = src + width;
   1.396 +    while (src < end) {
   1.397 +        __m64 a = MmxTo(*mask);
   1.398 +        __m64 s = MmxTo(*src);
   1.399 +        __m64 d = MmxTo(*dest);
   1.400 +        __m64 sa = MmxAlpha(s); /* taken before s is modified on the next line */
   1.401 +        MmxMul(s, a);
   1.402 +        MmxMul(a, sa);
   1.403 +        a = MmxNegate(a);
   1.404 +        MmxMulAdd(d, a, s); /* NOTE(review): presumably d = d*a + s -- confirm against the macro */
   1.405 +        *dest = MmxFrom(d);
   1.406 +        ++src;
   1.407 +        ++dest;
   1.408 +        ++mask;
   1.409 +    }
   1.410 +    _mm_empty(); /* clear MMX state before returning */
   1.411 +}
   1.412 +
   1.413 +static FASTCALL void
   1.414 +mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.415 +{ /* Disabled (#if 0). Masked variant: the masked source is weighted by the negated destination alpha and combined with the destination. */
   1.416 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.417 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.418 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.419 +    
   1.420 +    const CARD32 *end = src + width;
   1.421 +    while (src < end) {
   1.422 +        __m64 a = MmxTo(*mask);
   1.423 +        __m64 s = MmxTo(*src);
   1.424 +        __m64 d = MmxTo(*dest);
   1.425 +        __m64 da = MmxAlpha(d);
   1.426 +        da = MmxNegate(da);
   1.427 +        MmxMul(s, a);
   1.428 +        MmxMulAdd(s, da, d); /* NOTE(review): presumably s = s*da + d -- confirm against the macro */
   1.429 +        *dest = MmxFrom(s);
   1.430 +        ++src;
   1.431 +        ++dest;
   1.432 +        ++mask;
   1.433 +    }
   1.434 +    _mm_empty(); /* clear MMX state before returning */
   1.435 +}
   1.436 +
   1.437 +
   1.438 +static FASTCALL void
   1.439 +mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.440 +{ /* Disabled (#if 0). Masked variant: the source is combined with the mask pixel, then with the destination alpha, and stored in dest. */
   1.441 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.442 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.443 +    
   1.444 +    const CARD32 *end = src + width;
   1.445 +    while (src < end) {
   1.446 +        __m64 a = MmxTo(*mask);
   1.447 +        __m64 s = MmxTo(*src);
   1.448 +        __m64 d = MmxTo(*dest);
   1.449 +        __m64 da = MmxAlpha(d);
   1.450 +        MmxMul(s, a); /* NOTE(review): presumably s = s*a -- confirm against the macro */
   1.451 +        MmxMul(s, da);
   1.452 +        *dest = MmxFrom(s);
   1.453 +        ++src;
   1.454 +        ++dest;
   1.455 +        ++mask;
   1.456 +    }
   1.457 +    _mm_empty(); /* clear MMX state before returning */
   1.458 +}
   1.459 +
   1.460 +static FASTCALL void
   1.461 +mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.462 +{ /* Disabled (#if 0). Masked variant: the destination is combined with (mask combined with source alpha) and stored back. */
   1.463 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.464 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.465 +    
   1.466 +    const CARD32 *end = src + width;
   1.467 +    while (src < end) {
   1.468 +        __m64 a = MmxTo(*mask);
   1.469 +        __m64 s = MmxTo(*src);
   1.470 +        __m64 d = MmxTo(*dest);
   1.471 +        __m64 sa = MmxAlpha(s);
   1.472 +        MmxMul(a, sa); /* NOTE(review): presumably a = a*sa -- confirm against the macro */
   1.473 +        MmxMul(d, a);
   1.474 +        *dest = MmxFrom(d);
   1.475 +        ++src;
   1.476 +        ++dest;
   1.477 +        ++mask;
   1.478 +    }
   1.479 +    _mm_empty(); /* clear MMX state before returning */
   1.480 +}
   1.481 +
   1.482 +static FASTCALL void
   1.483 +mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.484 +{ /* Disabled (#if 0). Masked variant: the source is combined with the mask pixel, then with the negated destination alpha, and stored in dest. */
   1.485 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.486 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.487 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.488 +    
   1.489 +    const CARD32 *end = src + width;
   1.490 +    while (src < end) {
   1.491 +        __m64 a = MmxTo(*mask);
   1.492 +        __m64 s = MmxTo(*src);
   1.493 +        __m64 d = MmxTo(*dest);
   1.494 +        __m64 da = MmxAlpha(d);
   1.495 +        da = MmxNegate(da);
   1.496 +        MmxMul(s, a); /* NOTE(review): presumably s = s*a -- confirm against the macro */
   1.497 +        MmxMul(s, da);
   1.498 +        *dest = MmxFrom(s);
   1.499 +        ++src;
   1.500 +        ++dest;
   1.501 +        ++mask;
   1.502 +    }
   1.503 +    _mm_empty(); /* clear MMX state before returning */
   1.504 +}
   1.505 +
   1.506 +static FASTCALL void
   1.507 +mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.508 +{ /* Disabled (#if 0). Masked variant: the destination is combined with the negation of (mask combined with source alpha) and stored back. */
   1.509 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.510 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.511 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.512 +    
   1.513 +    const CARD32 *end = src + width;
   1.514 +    while (src < end) {
   1.515 +        __m64 a = MmxTo(*mask);
   1.516 +        __m64 s = MmxTo(*src);
   1.517 +        __m64 d = MmxTo(*dest);
   1.518 +        __m64 sa = MmxAlpha(s);
   1.519 +        MmxMul(a, sa); /* NOTE(review): presumably a = a*sa -- confirm against the macro */
   1.520 +        a = MmxNegate(a);
   1.521 +        MmxMul(d, a);
   1.522 +        *dest = MmxFrom(d);
   1.523 +        ++src;
   1.524 +        ++dest;
   1.525 +        ++mask;
   1.526 +    }
   1.527 +    _mm_empty(); /* clear MMX state before returning */
   1.528 +}
   1.529 +
   1.530 +static FASTCALL void
   1.531 +mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.532 +{ /* Disabled (#if 0). Masked "atop" loop: dest weighted by negated (mask combined with source alpha), combined with masked source weighted by dest alpha. */
   1.533 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.534 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.535 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.536 +    
   1.537 +    const CARD32 *end = src + width;
   1.538 +    while (src < end) {
   1.539 +        __m64 a = MmxTo(*mask);
   1.540 +        __m64 s = MmxTo(*src);
   1.541 +        __m64 d = MmxTo(*dest);
   1.542 +        __m64 da = MmxAlpha(d);
   1.543 +        __m64 sa = MmxAlpha(s); 
   1.544 +        MmxMul(s, a);
   1.545 +        MmxMul(a, sa);
   1.546 +        a = MmxNegate(a);
   1.547 +        MmxAddMul(d, a, s, da); /* NOTE(review): presumably d = d*a + s*da -- confirm against the macro */
   1.548 +        *dest = MmxFrom(d);
   1.549 +        ++src;
   1.550 +        ++dest;
   1.551 +        ++mask;
   1.552 +    }
   1.553 +    _mm_empty(); /* clear MMX state before returning */
   1.554 +}
   1.555 +
   1.556 +static FASTCALL void
   1.557 +mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.558 +{ /* Disabled (#if 0). Masked "atop reverse" loop: dest weighted by (mask combined with source alpha), combined with masked source weighted by negated dest alpha. */
   1.559 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.560 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.561 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.562 +    
   1.563 +    const CARD32 *end = src + width;
   1.564 +    while (src < end) {
   1.565 +        __m64 a = MmxTo(*mask);
   1.566 +        __m64 s = MmxTo(*src);
   1.567 +        __m64 d = MmxTo(*dest);
   1.568 +        __m64 da = MmxAlpha(d);
   1.569 +        __m64 sa = MmxAlpha(s); /* declaration terminated explicitly, as in the sibling Atop/Xor routines */
   1.570 +        MmxMul(s, a);
   1.571 +        MmxMul(a, sa);
   1.572 +        da = MmxNegate(da);
   1.573 +        MmxAddMul(d, a, s, da); /* NOTE(review): presumably d = d*a + s*da -- confirm against the macro */
   1.574 +        *dest = MmxFrom(d);
   1.575 +        ++src;
   1.576 +        ++dest;
   1.577 +        ++mask;
   1.578 +    }
   1.579 +    _mm_empty(); /* clear MMX state before returning */
   1.580 +}
   1.581 +
   1.582 +static FASTCALL void
   1.583 +mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.584 +{ /* Disabled (#if 0). Masked "xor" loop: dest weighted by negated (mask combined with source alpha), combined with masked source weighted by negated dest alpha. */
   1.585 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.586 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.587 +    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
   1.588 +    
   1.589 +    const CARD32 *end = src + width;
   1.590 +    while (src < end) {
   1.591 +        __m64 a = MmxTo(*mask);
   1.592 +        __m64 s = MmxTo(*src);
   1.593 +        __m64 d = MmxTo(*dest);
   1.594 +        __m64 da = MmxAlpha(d);
   1.595 +        __m64 sa = MmxAlpha(s);
   1.596 +        MmxMul(s, a);
   1.597 +        MmxMul(a, sa);
   1.598 +        da = MmxNegate(da);
   1.599 +        a = MmxNegate(a);
   1.600 +        MmxAddMul(d, a, s, da); /* NOTE(review): presumably d = d*a + s*da -- confirm against the macro */
   1.601 +        *dest = MmxFrom(d);
   1.602 +        ++src;
   1.603 +        ++dest;
   1.604 +        ++mask;
   1.605 +    }
   1.606 +    _mm_empty(); /* clear MMX state before returning */
   1.607 +}
   1.608 +
   1.609 +static FASTCALL void
   1.610 +mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
   1.611 +{ /* Disabled (#if 0). Masked "add" loop: the source is combined with the mask pixel, added to the destination, and stored in dest. */
   1.612 +    const __m64 mmx_0 = _mm_setzero_si64();
   1.613 +    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
   1.614 +    
   1.615 +    const CARD32 *end = src + width;
   1.616 +    while (src < end) {
   1.617 +        __m64 a = MmxTo(*mask);
   1.618 +        __m64 s = MmxTo(*src);
   1.619 +        __m64 d = MmxTo(*dest);
   1.620 +        MmxMul(s, a); /* NOTE(review): presumably s = s*a -- confirm against the macro */
   1.621 +        d = MmxAdd(s, d); /* the sum is assigned; MmxAdd is used as an expression */
   1.622 +        *dest = MmxFrom(d);
   1.623 +        ++src;
   1.624 +        ++dest;
   1.625 +        ++mask;
   1.626 +    }
   1.627 +    _mm_empty(); /* clear MMX state before returning */
   1.628 +}
   1.629 +
   1.630 +extern FbComposeFunctions composeFunctions; /* dispatch table defined elsewhere; filled in below */
   1.631 +
   1.632 +void fbComposeSetupMMX(void) /* Disabled (#if 0). Installs the mmxCombine* routines into composeFunctions when fbHaveMMX() is true; otherwise does nothing. */
   1.633 +{
   1.634 +    /* check if we have MMX support and initialize accordingly */
   1.635 +    if (fbHaveMMX()) {
   1.636 +        composeFunctions.combineU[PictOpOver] = mmxCombineOverU; /* combineU: the two-operand (dest, src) routines, indexed by PictOp* */
   1.637 +        composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
   1.638 +        composeFunctions.combineU[PictOpIn] = mmxCombineInU;
   1.639 +        composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
   1.640 +        composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
   1.641 +        composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
   1.642 +        composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
   1.643 +        composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
   1.644 +        composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
   1.645 +        composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
   1.646 +        composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
   1.647 +
   1.648 +        composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC; /* combineC: the three-operand (dest, src, mask) routines, indexed by PictOp* */
   1.649 +        composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
   1.650 +        composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
   1.651 +        composeFunctions.combineC[PictOpIn] = mmxCombineInC;
   1.652 +        composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
   1.653 +        composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
   1.654 +        composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
   1.655 +        composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
   1.656 +        composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
   1.657 +        composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
   1.658 +        composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
   1.659 +
   1.660 +        composeFunctions.combineMaskU = mmxCombineMaskU;
   1.661 +    } 
   1.662 +}
   1.663 +#endif
   1.664 +
   1.665 +
   1.666 +/* ------------------ MMX code paths called from fbpict.c ----------------------- */
   1.667 +
   1.668 +typedef union { /* One 64-bit constant viewed two ways: written through .ull in the `c` initializer, read through .m64 by MC(). */
   1.669 +  __m64 m64;
   1.670 +  uint64_t ull;
   1.671 +} m64_ull;
   1.672 +
   1.673 +typedef struct /* Layout of the constant table `c`; each field is reached as MC(<name without the mmx_ prefix>). */
   1.674 +{
   1.675 +    m64_ull mmx_4x00ff; /* used by negate() */
   1.676 +    m64_ull mmx_4x0080; /* used by pix_multiply() */
   1.677 +    m64_ull mmx_565_rgb; /* used by expand565() */
   1.678 +    m64_ull mmx_565_unpack_multiplier; /* used by expand565() */
   1.679 +    m64_ull mmx_565_r; /* used by pack565() */
   1.680 +    m64_ull mmx_565_g; /* used by pack565() */
   1.681 +    m64_ull mmx_565_b; /* used by pack565() */
   1.682 +    m64_ull mmx_mask_0; /* mmx_mask_0..3: used by pack565() to clear one 16-bit slot of the target */
   1.683 +    m64_ull mmx_mask_1;
   1.684 +    m64_ull mmx_mask_2;
   1.685 +    m64_ull mmx_mask_3;
   1.686 +    m64_ull mmx_full_alpha; /* used by over_rev_non_pre() */
   1.687 +    m64_ull mmx_ffff0000ffff0000; /* not referenced in the portion of the file visible here */
   1.688 +    m64_ull mmx_0000ffff00000000; /* not referenced in the portion of the file visible here */
   1.689 +    m64_ull mmx_000000000000ffff; /* not referenced in the portion of the file visible here */
   1.690 +} MMXData;
   1.691 +
   1.692 +static const MMXData c = /* Constant table read through MC(); values are given as 64-bit integers, i.e. four 16-bit lanes each. */
   1.693 +{
   1.694 +    .mmx_4x00ff.ull =			0x00ff00ff00ff00ffULL, /* 0x00ff in every 16-bit lane */
   1.695 +    .mmx_4x0080.ull =			0x0080008000800080ULL, /* 0x0080 in every 16-bit lane (rounding bias in pix_multiply) */
   1.696 +    .mmx_565_rgb.ull =			0x000001f0003f001fULL, /* per-lane field masks applied in expand565 */
   1.697 +    .mmx_565_r.ull =			0x000000f800000000ULL, /* bits 35..39 */
   1.698 +    .mmx_565_g.ull =			0x0000000000fc0000ULL, /* bits 18..23 */
   1.699 +    .mmx_565_b.ull =			0x00000000000000f8ULL, /* bits 3..7 */
   1.700 +    .mmx_mask_0.ull =			0xffffffffffff0000ULL, /* clears bits 0..15 */
   1.701 +    .mmx_mask_1.ull =			0xffffffff0000ffffULL, /* clears bits 16..31 */
   1.702 +    .mmx_mask_2.ull =			0xffff0000ffffffffULL, /* clears bits 32..47 */
   1.703 +    .mmx_mask_3.ull =			0x0000ffffffffffffULL, /* clears bits 48..63 */
   1.704 +    .mmx_full_alpha.ull =			0x00ff000000000000ULL, /* 0x00ff in the top 16-bit lane only */
   1.705 +    .mmx_565_unpack_multiplier.ull =	0x0000008404100840ULL, /* per-lane multipliers applied in expand565 */
   1.706 +    .mmx_ffff0000ffff0000.ull =		0xffff0000ffff0000ULL,
   1.707 +    .mmx_0000ffff00000000.ull =		0x0000ffff00000000ULL,
   1.708 +    .mmx_000000000000ffff.ull =		0x000000000000ffffULL,
   1.709 +};
   1.710 +
   1.711 +#define MC(x) ((__m64) c.mmx_##x.m64) /* MC(name) reads field mmx_<name> of the constant table `c` as an __m64 */
   1.712 +
   1.713 +static __inline__ __m64
   1.714 +shift (__m64 v, int s) /* Signed 64-bit shift: s > 0 shifts v left by s bits, s < 0 shifts right (logical) by -s bits, s == 0 returns v. */
   1.715 +{
   1.716 +    if (s > 0)
   1.717 +	return _mm_slli_si64 (v, s);
   1.718 +    else if (s < 0)
   1.719 +	return _mm_srli_si64 (v, -s); /* logical shift: zeros are shifted in */
   1.720 +    else
   1.721 +	return v;
   1.722 +}
   1.723 +
   1.724 +static __inline__ __m64
   1.725 +negate (__m64 mask) /* XORs every 16-bit lane with 0x00ff; for a lane holding an 8-bit value x this yields 255 - x. */
   1.726 +{
   1.727 +    return _mm_xor_si64 (mask, MC(4x00ff));
   1.728 +}
   1.729 +
   1.730 +static __inline__ __m64
   1.731 +pix_multiply (__m64 a, __m64 b) /* Per 16-bit lane: t = a*b + 0x80, result = (t + (t >> 8)) >> 8. With 8-bit values in each lane this is a*b/255, rounded. */
   1.732 +{
   1.733 +    __m64 res;
   1.734 +    
   1.735 +    res = _mm_mullo_pi16 (a, b); /* low 16 bits of each lane product */
   1.736 +    res = _mm_adds_pu16 (res, MC(4x0080)); /* add the 0x80 rounding bias (unsigned saturating add) */
   1.737 +    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
   1.738 +    res = _mm_srli_pi16 (res, 8);
   1.739 +    
   1.740 +    return res;
   1.741 +}
   1.742 +
   1.743 +static __inline__ __m64
   1.744 +expand_alpha (__m64 pixel) /* Copies 16-bit lane 3 (the top lane) of pixel into all four lanes. */
   1.745 +{
   1.746 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
   1.747 +}
   1.748 +
   1.749 +static __inline__ __m64
   1.750 +expand_alpha_rev (__m64 pixel) /* Copies 16-bit lane 0 (the bottom lane) of pixel into all four lanes. */
   1.751 +{
   1.752 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
   1.753 +}    
   1.754 +
   1.755 +static __inline__ __m64
   1.756 +invert_colors (__m64 pixel) /* Swaps 16-bit lanes 0 and 2 of pixel; lanes 1 and 3 stay in place. */
   1.757 +{
   1.758 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
   1.759 +}
   1.760 +
   1.761 +/* Notes about writing mmx code
   1.762 + *
   1.763 + * give memory operands as the second operand. If you give it as the
   1.764 + * first, gcc will first load it into a register, then use that
   1.765 + * register
   1.766 + *
   1.767 + *   i.e. use
   1.768 + *
   1.769 + *         _mm_mullo_pi16 (x, mmx_constant);
   1.770 + *
   1.771 + *   not
   1.772 + *
   1.773 + *         _mm_mullo_pi16 (mmx_constant, x);
   1.774 + *
   1.775 + * Also try to minimize dependencies. i.e. when you need a value, try
   1.776 + * to calculate it from a value that was calculated as early as
   1.777 + * possible.
   1.778 + */
   1.779 +
   1.780 +static __inline__ __m64
   1.781 +over (__m64 src, __m64 srca, __m64 dest) /* Returns src + dest*(255 - srca)/255 per lane, using pix_multiply and a saturating byte-wise add. */
   1.782 +{
   1.783 +    return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
   1.784 +}
   1.785 +
   1.786 +static __inline__ __m64
   1.787 +over_rev_non_pre (__m64 src, __m64 dest) /* Lane-swaps src (invert_colors), multiplies it by its own alpha, then composites it over dest. NOTE(review): the name suggests src is non-premultiplied with reversed channel order -- confirm with callers. */
   1.788 +{
   1.789 +    __m64 srca = expand_alpha (src);
   1.790 +    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); /* alpha in lanes 0..2; lane 3 is OR-ed with 0x00ff so the alpha lane is multiplied by 255 */
   1.791 +    
   1.792 +    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
   1.793 +}
   1.794 +
   1.795 +static __inline__ __m64
   1.796 +in (__m64 src, /* Returns pix_multiply(src, mask): src scaled lane-wise by mask. */
   1.797 +    __m64 mask)
   1.798 +{
   1.799 +    return pix_multiply (src, mask);
   1.800 +}
   1.801 +
   1.802 +static __inline__ __m64
   1.803 +in_over (__m64 src, /* Scales both src and srca by mask (pix_multiply), then composites the result over dest with over(). */
   1.804 +	 __m64 srca,
   1.805 +	 __m64 mask,
   1.806 +	 __m64 dest)
   1.807 +{
   1.808 +    return over(in(src, mask), pix_multiply(srca, mask), dest);
   1.809 +}
   1.810 +
   1.811 +static __inline__ __m64
   1.812 +load8888 (CARD32 v) /* Widens the four bytes of v into four 16-bit lanes (byte 0 in lane 0 ... byte 3 in lane 3), zero-extended. */
   1.813 +{
   1.814 +    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
   1.815 +}
   1.816 +
   1.817 +static __inline__ __m64
   1.818 +pack8888 (__m64 lo, __m64 hi) /* Narrows the eight 16-bit lanes of lo and hi to eight bytes with unsigned saturation; lo fills the low 32 bits, hi the high 32 bits. */
   1.819 +{
   1.820 +    __m64 r;
   1.821 +    r = _mm_packs_pu16 (lo, hi);
   1.822 +    return r;
   1.823 +}
   1.824 +
   1.825 +static __inline__ CARD32
   1.826 +store8888 (__m64 v) /* Inverse of load8888: packs the four 16-bit lanes of v to bytes (saturating) and returns them as one 32-bit value. */
   1.827 +{
   1.828 +    return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
   1.829 +}
   1.830 +
   1.831 +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
   1.832 + *
   1.833 + *    00RR00GG00BB
   1.834 + * 
   1.835 + * --- Expanding 565 in the low word ---
   1.836 + * 
   1.837 + * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
   1.838 + * m = m & (01f0003f001f);
   1.839 + * m = m * (008404100840);
   1.840 + * m = m >> 8;
   1.841 + * 
   1.842 + * Note the trick here - the top word is shifted by another nibble to
   1.843 + * avoid it bumping into the middle word
   1.844 + */
   1.845 +static __inline__ __m64
   1.846 +expand565 (__m64 pixel, int pos)
   1.847 +{
   1.848 +    __m64 p = pixel;
   1.849 +    __m64 t1, t2;
   1.850 +    
   1.851 +    /* move pixel to low 16 bit and zero the rest */
   1.852 +    p = shift (shift (p, (3 - pos) * 16), -48); 
   1.853 +    
   1.854 +    t1 = shift (p, 36 - 11); /* copy for the red field: bits 11..15 land on bits 36..40 */
   1.855 +    t2 = shift (p, 16 - 5); /* copy for the green field: bits 5..10 land on bits 16..21 */
   1.856 +    
   1.857 +    p = _mm_or_si64 (t1, p);
   1.858 +    p = _mm_or_si64 (t2, p);
   1.859 +    p = _mm_and_si64 (p, MC(565_rgb)); /* keep only one field per 16-bit lane */
   1.860 +    
   1.861 +    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); /* per-lane multiply, followed by >> 8, widens each field to 8 bits */
   1.862 +    return _mm_srli_pi16 (pixel, 8);
   1.863 +}
   1.864 +
   1.865 +static __inline__ __m64
   1.866 +expand8888 (__m64 in, int pos) /* Widens one of the two 32-bit halves of `in` to four 16-bit lanes: the low half when pos == 0, the high half otherwise. */
   1.867 +{
   1.868 +    if (pos == 0)
   1.869 +	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
   1.870 +    else
   1.871 +	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
   1.872 +}
   1.873 +
   1.874 +static __inline__ __m64
   1.875 +pack565 (__m64 pixel, __m64 target, int pos) /* Packs the widened pixel into 5-6-5 bits and inserts it into 16-bit slot `pos` (0-3) of target; the other slots of target are returned unchanged. */
   1.876 +{
   1.877 +    __m64 p = pixel;
   1.878 +    __m64 t = target;
   1.879 +    __m64 r, g, b;
   1.880 +    
   1.881 +    r = _mm_and_si64 (p, MC(565_r)); /* top 5 bits of the byte in lane 2 */
   1.882 +    g = _mm_and_si64 (p, MC(565_g)); /* top 6 bits of the byte in lane 1 */
   1.883 +    b = _mm_and_si64 (p, MC(565_b)); /* top 5 bits of the byte in lane 0 */
   1.884 +    
   1.885 +    r = shift (r, - (32 - 8) + pos * 16); /* to bits 11..15 of slot pos */
   1.886 +    g = shift (g, - (16 - 3) + pos * 16); /* to bits 5..10 of slot pos */
   1.887 +    b = shift (b, - (0  + 3) + pos * 16); /* to bits 0..4 of slot pos */
   1.888 +    
   1.889 +    if (pos == 0) /* clear the destination slot before OR-ing the new fields in */
   1.890 +	t = _mm_and_si64 (t, MC(mask_0));
   1.891 +    else if (pos == 1)
   1.892 +	t = _mm_and_si64 (t, MC(mask_1));
   1.893 +    else if (pos == 2)
   1.894 +	t = _mm_and_si64 (t, MC(mask_2));
   1.895 +    else if (pos == 3)
   1.896 +	t = _mm_and_si64 (t, MC(mask_3));
   1.897 +    
   1.898 +    p = _mm_or_si64 (r, t);
   1.899 +    p = _mm_or_si64 (g, p);
   1.900 +    
   1.901 +    return _mm_or_si64 (b, p);
   1.902 +}
   1.903 +
   1.904 +#ifdef ENABLE_BROKEN_IMPLS
   1.905 +/* Apply over() with the single source pixel *src to w 32-bit pixels at dst.  broken.  See Debian bug #340932 */
   1.906 +static void
   1.907 +fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
   1.908 +{
   1.909 +    __m64	vsrc, vsrca;
   1.910 +    /* the source is constant: load it and its expand_alpha() form once, outside the loops */
   1.911 +    vsrc = load8888 (*src);
   1.912 +    vsrca = expand_alpha (vsrc);
   1.913 +    /* head: one pixel at a time until dst is 8-byte aligned */
   1.914 +    while (w && (unsigned long)dst & 7)
   1.915 +    {
   1.916 +        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
   1.917 +        
   1.918 +        w--;
   1.919 +        dst++;
   1.920 +    }
   1.921 +    /* body: two pixels per 64-bit load/store */
   1.922 +    while (w >= 2)
   1.923 +    {
   1.924 +        __m64 vdest;
   1.925 +        __m64 dest0, dest1;
   1.926 +        
   1.927 +        vdest = *(__m64 *)dst;
   1.928 +        
   1.929 +        dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
   1.930 +        dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
   1.931 +        
   1.932 +        *(__m64 *)dst = pack8888(dest0, dest1);
   1.933 +        
   1.934 +        dst += 2;
   1.935 +        w -= 2;
   1.936 +    }
   1.937 +    /* tail: at most one pixel is left after the pair loop */
   1.938 +    while (w)
   1.939 +    {
   1.940 +        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
   1.941 +        
   1.942 +        w--;
   1.943 +        dst++;
   1.944 +    }
   1.945 +    /* empty the MMX state (EMMS) before returning */
   1.946 +    _mm_empty();
   1.947 +}
   1.948 +OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
   1.949 +    OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
   1.950 +#endif
   1.951 +
   1.952 +#if 0 /* disabled: uses the PicturePtr / fbComposeGetStart interface, not the (dst, src, w) form of the enabled code */
   1.953 +void
   1.954 +fbCompositeSolid_nx0565mmx (CARD8	op,
   1.955 +			    PicturePtr pSrc,
   1.956 +			    PicturePtr pMask,
   1.957 +			    PicturePtr pDst,
   1.958 +			    INT16	xSrc,
   1.959 +			    INT16	ySrc,
   1.960 +			    INT16	xMask,
   1.961 +			    INT16	yMask,
   1.962 +			    INT16	xDst,
   1.963 +			    INT16	yDst,
   1.964 +			    CARD16	width,
   1.965 +			    CARD16	height)
   1.966 +{
   1.967 +    CARD32	src;
   1.968 +    CARD16	*dstLine, *dst;
   1.969 +    CARD16	w;
   1.970 +    FbStride	dstStride;
   1.971 +    __m64	vsrc, vsrca;
   1.972 +    /* Applies over() with one solid colour to a width x height area of 16-bit destination pixels */
   1.973 +    CHECKPOINT();
   1.974 +    
   1.975 +    fbComposeGetSolid(pSrc, src, pDst->format);
   1.976 +    /* nothing to do when bits 24-31 of the solid colour are zero */
   1.977 +    if (src >> 24 == 0)
   1.978 +	return;
   1.979 +    
   1.980 +    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
   1.981 +    
   1.982 +    vsrc = load8888 (src);
   1.983 +    vsrca = expand_alpha (vsrc);
   1.984 +    
   1.985 +    while (height--)
   1.986 +    {
   1.987 +	dst = dstLine;
   1.988 +	dstLine += dstStride;
   1.989 +	w = width;
   1.990 +	
   1.991 +	CHECKPOINT();
   1.992 +	/* head: single pixels until dst is 8-byte aligned */
   1.993 +	while (w && (unsigned long)dst & 7)
   1.994 +	{
   1.995 +	    ullong d = *dst;
   1.996 +	    __m64 vdest = expand565 ((__m64)d, 0);
   1.997 +	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
   1.998 +	    *dst = (ullong)vdest;
   1.999 +	    
  1.1000 +	    w--;
  1.1001 +	    dst++;
  1.1002 +	}
  1.1003 +	/* body: four 16-bit pixels per 64-bit word; each lane is expanded, composited and packed back in turn */
  1.1004 +	while (w >= 4)
  1.1005 +	{
  1.1006 +	    __m64 vdest;
  1.1007 +	    
  1.1008 +	    vdest = *(__m64 *)dst;
  1.1009 +	    
  1.1010 +	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
  1.1011 +	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
  1.1012 +	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
  1.1013 +	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
  1.1014 +	    
  1.1015 +	    *(__m64 *)dst = vdest;
  1.1016 +	    
  1.1017 +	    dst += 4;
  1.1018 +	    w -= 4;
  1.1019 +	}
  1.1020 +	
  1.1021 +	CHECKPOINT();
  1.1022 +	/* tail: the remaining 0-3 pixels, one at a time */
  1.1023 +	while (w)
  1.1024 +	{
  1.1025 +	    ullong d = *dst;
  1.1026 +	    __m64 vdest = expand565 ((__m64)d, 0);
  1.1027 +	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
  1.1028 +	    *dst = (ullong)vdest;
  1.1029 +	    
  1.1030 +	    w--;
  1.1031 +	    dst++;
  1.1032 +	}
  1.1033 +    }
  1.1034 +    
  1.1035 +    _mm_empty();
  1.1036 +}
  1.1037 +#endif
  1.1038 +
  1.1039 +#if 0 /* disabled: incomplete port - the body uses twidth, p and q, which are not declared in this function, and redeclares src */
  1.1040 +static void
  1.1041 +fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
  1.1042 +{
  1.1043 +    CARD32	src, srca;
  1.1044 +    CARD32	*dstLine;
  1.1045 +    CARD32	*maskLine;
  1.1046 +    FbStride	dstStride, maskStride;
  1.1047 +    __m64	vsrc, vsrca;
  1.1048 +    /* NOTE(review): vsrc and vsrca are never assigned in this version before being used below */
  1.1049 +    /* head: single pixels until q is 8-byte aligned; pixels whose 32-bit mask word is zero are left untouched */
  1.1050 +    while (twidth && (unsigned long)q & 7)
  1.1051 +    {
  1.1052 +        CARD32 m = *(CARD32 *)p;
  1.1053 +        
  1.1054 +        if (m)
  1.1055 +        {
  1.1056 +            __m64 vdest = load8888(*q);
  1.1057 +            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
  1.1058 +            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
  1.1059 +        }
  1.1060 +        
  1.1061 +        twidth--;
  1.1062 +        p++;
  1.1063 +        q++;
  1.1064 +    }
  1.1065 +    /* body: two pixels per 64-bit word, skipped when both mask words are zero */
  1.1066 +    while (twidth >= 2)
  1.1067 +    {
  1.1068 +        CARD32 m0, m1;
  1.1069 +        m0 = *p;
  1.1070 +        m1 = *(p + 1);
  1.1071 +        
  1.1072 +        if (m0 | m1)
  1.1073 +        {
  1.1074 +            __m64 dest0, dest1;
  1.1075 +            __m64 vdest = *(__m64 *)q;
  1.1076 +            
  1.1077 +            dest0 = in_over(vsrc, vsrca, load8888(m0),
  1.1078 +                            expand8888 (vdest, 0));
  1.1079 +            dest1 = in_over(vsrc, vsrca, load8888(m1),
  1.1080 +                            expand8888 (vdest, 1));
  1.1081 +            
  1.1082 +            *(__m64 *)q = pack8888(dest0, dest1);
  1.1083 +        }
  1.1084 +        
  1.1085 +        p += 2;
  1.1086 +        q += 2;
  1.1087 +        twidth -= 2;
  1.1088 +    }
  1.1089 +    /* tail: at most one pixel left */
  1.1090 +    while (twidth)
  1.1091 +    {
  1.1092 +        CARD32 m = *(CARD32 *)p;
  1.1093 +        
  1.1094 +        if (m)
  1.1095 +        {
  1.1096 +            __m64 vdest = load8888(*q);
  1.1097 +            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
  1.1098 +            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
  1.1099 +        }
  1.1100 +        
  1.1101 +        twidth--;
  1.1102 +        p++;
  1.1103 +        q++;
  1.1104 +    }
  1.1105 +    
  1.1106 +    _mm_empty();
  1.1107 +}
  1.1108 +#endif
  1.1109 +
  1.1110 +#if 0
  1.1111 +static void
  1.1112 +fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
  1.1113 +    int width)
  1.1114 +{
  1.1115 +/* NOTE(review): incomplete port - maskLine, vmask, srca, height, dst, dstLine, srcLine, w (among others) are not declared in this function */
  1.1116 +    mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
  1.1117 +    vmask = load8888 (mask);
  1.1118 +    srca = MC(4x00ff);
  1.1119 +    
  1.1120 +    while (height--)
  1.1121 +    {
  1.1122 +	dst = dstLine;
  1.1123 +	dstLine += dstStride;
  1.1124 +	src = srcLine;
  1.1125 +	srcLine += srcStride;
  1.1126 +	w = width;
  1.1127 +	/* head: single pixels until dst is 8-byte aligned */
  1.1128 +	while (w && (unsigned long)dst & 7)
  1.1129 +	{
  1.1130 +	    __m64 s = load8888 (*src);
  1.1131 +	    __m64 d = load8888 (*dst);
  1.1132 +	    
  1.1133 +	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
  1.1134 +	    
  1.1135 +	    w--;
  1.1136 +	    dst++;
  1.1137 +	    src++;
  1.1138 +	}
  1.1139 +	/* main loop: 16 pixels (eight 64-bit words) of src and dst per iteration */
  1.1140 +	while (w >= 16)
  1.1141 +	{
  1.1142 +	    __m64 vd0 = *(__m64 *)(dst + 0);
  1.1143 +	    __m64 vd1 = *(__m64 *)(dst + 2);
  1.1144 +	    __m64 vd2 = *(__m64 *)(dst + 4);
  1.1145 +	    __m64 vd3 = *(__m64 *)(dst + 6);
  1.1146 +	    __m64 vd4 = *(__m64 *)(dst + 8);
  1.1147 +	    __m64 vd5 = *(__m64 *)(dst + 10);
  1.1148 +	    __m64 vd6 = *(__m64 *)(dst + 12);
  1.1149 +	    __m64 vd7 = *(__m64 *)(dst + 14);
  1.1150 +
  1.1151 +	    __m64 vs0 = *(__m64 *)(src + 0);
  1.1152 +	    __m64 vs1 = *(__m64 *)(src + 2);
  1.1153 +	    __m64 vs2 = *(__m64 *)(src + 4);
  1.1154 +	    __m64 vs3 = *(__m64 *)(src + 6);
  1.1155 +	    __m64 vs4 = *(__m64 *)(src + 8);
  1.1156 +	    __m64 vs5 = *(__m64 *)(src + 10);
  1.1157 +	    __m64 vs6 = *(__m64 *)(src + 12);
  1.1158 +	    __m64 vs7 = *(__m64 *)(src + 14);
  1.1159 +
  1.1160 +	    vd0 = (__m64)pack8888 (
  1.1161 +		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
  1.1162 +		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
  1.1163 +	
  1.1164 +	    vd1 = (__m64)pack8888 (
  1.1165 +		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
  1.1166 +		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
  1.1167 +	
  1.1168 +	    vd2 = (__m64)pack8888 (
  1.1169 +		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
  1.1170 +		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
  1.1171 +	
  1.1172 +	    vd3 = (__m64)pack8888 (
  1.1173 +		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
  1.1174 +		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
  1.1175 +	
  1.1176 +	    vd4 = (__m64)pack8888 (
  1.1177 +		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
  1.1178 +		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
  1.1179 +	
  1.1180 +	    vd5 = (__m64)pack8888 (
  1.1181 +		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
  1.1182 +		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
  1.1183 +	
  1.1184 +	    vd6 = (__m64)pack8888 (
  1.1185 +		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
  1.1186 +		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
  1.1187 +	
  1.1188 +	    vd7 = (__m64)pack8888 (
  1.1189 +		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
  1.1190 +		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
  1.1191 +
  1.1192 +    	    *(__m64 *)(dst + 0) = vd0;
  1.1193 +	    *(__m64 *)(dst + 2) = vd1;
  1.1194 +	    *(__m64 *)(dst + 4) = vd2;
  1.1195 +	    *(__m64 *)(dst + 6) = vd3;
  1.1196 +	    *(__m64 *)(dst + 8) = vd4;
  1.1197 +	    *(__m64 *)(dst + 10) = vd5;
  1.1198 +	    *(__m64 *)(dst + 12) = vd6;
  1.1199 +	    *(__m64 *)(dst + 14) = vd7;
  1.1200 +	
  1.1201 +	    w -= 16;
  1.1202 +	    dst += 16;
  1.1203 +	    src += 16;
  1.1204 +	}
  1.1205 +	/* tail: the remaining 0-15 pixels, one at a time */
  1.1206 +	while (w)
  1.1207 +	{
  1.1208 +	    __m64 s = load8888 (*src);
  1.1209 +	    __m64 d = load8888 (*dst);
  1.1210 +	    
  1.1211 +	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
  1.1212 +	    
  1.1213 +	    w--;
  1.1214 +	    dst++;
  1.1215 +	    src++;
  1.1216 +	}
  1.1217 +    }
  1.1218 +
  1.1219 +    _mm_empty(); 
  1.1220 +}
  1.1221 +/* Apply over() from a 32-bit source area onto a 32-bit destination area, using each source pixel's expand_alpha() */
  1.1222 +void
  1.1223 +fbCompositeSrc_8888x8888mmx (CARD8	op,
  1.1224 +			     PicturePtr pSrc,
  1.1225 +			     PicturePtr pMask,
  1.1226 +			     PicturePtr pDst,
  1.1227 +			     INT16	xSrc,
  1.1228 +			     INT16	ySrc,
  1.1229 +			     INT16      xMask,
  1.1230 +			     INT16      yMask,
  1.1231 +			     INT16      xDst,
  1.1232 +			     INT16      yDst,
  1.1233 +			     CARD16     width,
  1.1234 +			     CARD16     height)
  1.1235 +{
  1.1236 +    CARD32	*dstLine, *dst;
  1.1237 +    CARD32	*srcLine, *src;
  1.1238 +    FbStride	dstStride, srcStride;
  1.1239 +    CARD16	w;
  1.1240 +    __m64  srca;
  1.1241 +    
  1.1242 +    CHECKPOINT();
  1.1243 +    
  1.1244 +    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1.1245 +    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1.1246 +    /* NOTE(review): srca is assigned here but not read anywhere else in this function */
  1.1247 +    srca = MC (4x00ff);
  1.1248 +    
  1.1249 +    while (height--)
  1.1250 +    {
  1.1251 +	dst = dstLine;
  1.1252 +	dstLine += dstStride;
  1.1253 +	src = srcLine;
  1.1254 +	srcLine += srcStride;
  1.1255 +	w = width;
  1.1256 +	/* head: single pixels until dst is 8-byte aligned */
  1.1257 +	while (w && (unsigned long)dst & 7)
  1.1258 +	{
  1.1259 +	    __m64 s = load8888 (*src);
  1.1260 +	    __m64 d = load8888 (*dst);
  1.1261 +	    
  1.1262 +	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
  1.1263 +	    
  1.1264 +	    w--;
  1.1265 +	    dst++;
  1.1266 +	    src++;
  1.1267 +	}
  1.1268 +	/* body: two pixels per 64-bit word */
  1.1269 +	while (w >= 2)
  1.1270 +	{
  1.1271 +	    __m64 vd = *(__m64 *)(dst + 0);
  1.1272 +	    __m64 vs = *(__m64 *)(src + 0);
  1.1273 +	    __m64 vs0 = expand8888 (vs, 0);
  1.1274 +	    __m64 vs1 = expand8888 (vs, 1);
  1.1275 +
  1.1276 +	    *(__m64 *)dst = (__m64)pack8888 (
  1.1277 +		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
  1.1278 +		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
  1.1279 +	    
  1.1280 +	    w -= 2;
  1.1281 +	    dst += 2;
  1.1282 +	    src += 2;
  1.1283 +	}
  1.1284 +	/* tail: at most one pixel left */
  1.1285 +	while (w)
  1.1286 +	{
  1.1287 +	    __m64 s = load8888 (*src);
  1.1288 +	    __m64 d = load8888 (*dst);
  1.1289 +	    
  1.1290 +	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
  1.1291 +				     (__m64)_mm_setzero_si64());
  1.1292 +	    
  1.1293 +	    w--;
  1.1294 +	    dst++;
  1.1295 +	    src++;
  1.1296 +	}
  1.1297 +    }
  1.1298 +
  1.1299 +    _mm_empty(); 
  1.1300 +}
  1.1301 +/* Apply in_over() with one solid colour and an 8-bit-per-pixel mask onto a 32-bit destination area */
  1.1302 +void
  1.1303 +fbCompositeSolidMask_nx8x8888mmx (CARD8      op,
  1.1304 +				  PicturePtr pSrc,
  1.1305 +				  PicturePtr pMask,
  1.1306 +				  PicturePtr pDst,
  1.1307 +				  INT16      xSrc,
  1.1308 +				  INT16      ySrc,
  1.1309 +				  INT16      xMask,
  1.1310 +				  INT16      yMask,
  1.1311 +				  INT16      xDst,
  1.1312 +				  INT16      yDst,
  1.1313 +				  CARD16     width,
  1.1314 +				  CARD16     height)
  1.1315 +{
  1.1316 +    CARD32	src, srca;
  1.1317 +    CARD32	*dstLine, *dst;
  1.1318 +    CARD8	*maskLine, *mask;
  1.1319 +    FbStride	dstStride, maskStride;
  1.1320 +    CARD16	w;
  1.1321 +    __m64	vsrc, vsrca;
  1.1322 +    ullong	srcsrc;
  1.1323 +    
  1.1324 +    CHECKPOINT();
  1.1325 +    
  1.1326 +    fbComposeGetSolid(pSrc, src, pDst->format);
  1.1327 +    /* srca is bits 24-31 of the solid colour; nothing to do when it is zero */
  1.1328 +    srca = src >> 24;
  1.1329 +    if (srca == 0)
  1.1330 +	return;
  1.1331 +    /* two copies of the solid colour in one 64-bit word, for the direct-store fast path below */
  1.1332 +    srcsrc = (unsigned long long)src << 32 | src;
  1.1333 +    
  1.1334 +    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1.1335 +    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
  1.1336 +    
  1.1337 +    vsrc = load8888 (src);
  1.1338 +    vsrca = expand_alpha (vsrc);
  1.1339 +    
  1.1340 +    while (height--)
  1.1341 +    {
  1.1342 +	dst = dstLine;
  1.1343 +	dstLine += dstStride;
  1.1344 +	mask = maskLine;
  1.1345 +	maskLine += maskStride;
  1.1346 +	w = width;
  1.1347 +	
  1.1348 +	CHECKPOINT();
  1.1349 +	/* head: single pixels until dst is 8-byte aligned; a zero mask byte leaves the pixel untouched */
  1.1350 +	while (w && (unsigned long)dst & 7)
  1.1351 +	{
  1.1352 +	    ullong m = *mask;
  1.1353 +	    
  1.1354 +	    if (m)
  1.1355 +	    {
  1.1356 +		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
  1.1357 +		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
  1.1358 +	    }
  1.1359 +	    
  1.1360 +	    w--;
  1.1361 +	    mask++;
  1.1362 +	    dst++;
  1.1363 +	}
  1.1364 +	
  1.1365 +	CHECKPOINT();
  1.1366 +	/* body: two pixels per 64-bit word */
  1.1367 +	while (w >= 2)
  1.1368 +	{
  1.1369 +	    ullong m0, m1;
  1.1370 +	    m0 = *mask;
  1.1371 +	    m1 = *(mask + 1);
  1.1372 +	    /* fast path: srca and both mask bytes are 0xff, so the pair is overwritten with the solid colour */
  1.1373 +	    if (srca == 0xff && (m0 & m1) == 0xff)
  1.1374 +	    {
  1.1375 +		*(unsigned long long *)dst = srcsrc;
  1.1376 +	    }
  1.1377 +	    else if (m0 | m1)
  1.1378 +	    {
  1.1379 +		__m64 vdest;
  1.1380 +		__m64 dest0, dest1;
  1.1381 +		
  1.1382 +		vdest = *(__m64 *)dst;
  1.1383 +		
  1.1384 +		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
  1.1385 +		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
  1.1386 +		
  1.1387 +		*(__m64 *)dst = pack8888(dest0, dest1);
  1.1388 +	    }
  1.1389 +	    
  1.1390 +	    mask += 2;
  1.1391 +	    dst += 2;
  1.1392 +	    w -= 2;
  1.1393 +	}
  1.1394 +	
  1.1395 +	CHECKPOINT();
  1.1396 +	/* tail: at most one pixel left */
  1.1397 +	while (w)
  1.1398 +	{
  1.1399 +	    ullong m = *mask;
  1.1400 +	    
  1.1401 +	    if (m)
  1.1402 +	    {
  1.1403 +		__m64 vdest = load8888(*dst);
  1.1404 +		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
  1.1405 +		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
  1.1406 +	    }
  1.1407 +	    
  1.1408 +	    w--;
  1.1409 +	    mask++;
  1.1410 +	    dst++;
  1.1411 +	}
  1.1412 +    }
  1.1413 +    
  1.1414 +    _mm_empty();
  1.1415 +}
  1.1416 +
  1.1417 +/* Apply in_over() with one solid colour and an 8-bit-per-pixel mask onto a 16-bit (565) destination area */
  1.1418 +void
  1.1419 +fbCompositeSolidMask_nx8x0565mmx (CARD8      op,
  1.1420 +				  PicturePtr pSrc,
  1.1421 +				  PicturePtr pMask,
  1.1422 +				  PicturePtr pDst,
  1.1423 +				  INT16      xSrc,
  1.1424 +				  INT16      ySrc,
  1.1425 +				  INT16      xMask,
  1.1426 +				  INT16      yMask,
  1.1427 +				  INT16      xDst,
  1.1428 +				  INT16      yDst,
  1.1429 +				  CARD16     width,
  1.1430 +				  CARD16     height)
  1.1431 +{
  1.1432 +    CARD32	src, srca;
  1.1433 +    CARD16	*dstLine, *dst;
  1.1434 +    CARD8	*maskLine, *mask;
  1.1435 +    FbStride	dstStride, maskStride;
  1.1436 +    CARD16	w;
  1.1437 +    __m64	vsrc, vsrca;
  1.1438 +    unsigned long long srcsrcsrcsrc, src16;
  1.1439 +    
  1.1440 +    CHECKPOINT();
  1.1441 +    
  1.1442 +    fbComposeGetSolid(pSrc, src, pDst->format);
  1.1443 +    /* srca is bits 24-31 of the solid colour; nothing to do when it is zero */
  1.1444 +    srca = src >> 24;
  1.1445 +    if (srca == 0)
  1.1446 +	return;
  1.1447 +    
  1.1448 +    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1.1449 +    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
  1.1450 +    
  1.1451 +    vsrc = load8888 (src);
  1.1452 +    vsrca = expand_alpha (vsrc);
  1.1453 +    /* src16: the solid colour packed to 565 in lane 0 */
  1.1454 +    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
  1.1455 +    /* four copies of src16 in one 64-bit word, for the direct-store fast path below */
  1.1456 +    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
  1.1457 +	(ullong)src16 << 16 | (ullong)src16;
  1.1458 +    
  1.1459 +    while (height--)
  1.1460 +    {
  1.1461 +	dst = dstLine;
  1.1462 +	dstLine += dstStride;
  1.1463 +	mask = maskLine;
  1.1464 +	maskLine += maskStride;
  1.1465 +	w = width;
  1.1466 +	
  1.1467 +	CHECKPOINT();
  1.1468 +	/* head: single pixels until dst is 8-byte aligned; a zero mask byte leaves the pixel untouched */
  1.1469 +	while (w && (unsigned long)dst & 7)
  1.1470 +	{
  1.1471 +	    ullong m = *mask;
  1.1472 +	    
  1.1473 +	    if (m)
  1.1474 +	    {
  1.1475 +		ullong d = *dst;
  1.1476 +		__m64 vd = (__m64)d;
  1.1477 +		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
  1.1478 +		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
  1.1479 +	    }
  1.1480 +	    
  1.1481 +	    w--;
  1.1482 +	    mask++;
  1.1483 +	    dst++;
  1.1484 +	}
  1.1485 +	
  1.1486 +	CHECKPOINT();
  1.1487 +	/* body: four 16-bit pixels per 64-bit word */
  1.1488 +	while (w >= 4)
  1.1489 +	{
  1.1490 +	    ullong m0, m1, m2, m3;
  1.1491 +	    m0 = *mask;
  1.1492 +	    m1 = *(mask + 1);
  1.1493 +	    m2 = *(mask + 2);
  1.1494 +	    m3 = *(mask + 3);
  1.1495 +	    /* fast path: srca and all four mask bytes are 0xff, so the word is overwritten with the packed colour */
  1.1496 +	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
  1.1497 +	    {
  1.1498 +		*(unsigned long long *)dst = srcsrcsrcsrc;
  1.1499 +	    }
  1.1500 +	    else if (m0 | m1 | m2 | m3)
  1.1501 +	    {
  1.1502 +		__m64 vdest;
  1.1503 +		__m64 vm0, vm1, vm2, vm3;
  1.1504 +		
  1.1505 +		vdest = *(__m64 *)dst;
  1.1506 +		
  1.1507 +		vm0 = (__m64)m0;
  1.1508 +		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
  1.1509 +		vm1 = (__m64)m1;
  1.1510 +		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
  1.1511 +		vm2 = (__m64)m2;
  1.1512 +		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
  1.1513 +		vm3 = (__m64)m3;
  1.1514 +		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
  1.1515 +		
  1.1516 +		*(__m64 *)dst = vdest;
  1.1517 +	    }
  1.1518 +	    
  1.1519 +	    w -= 4;
  1.1520 +	    mask += 4;
  1.1521 +	    dst += 4;
  1.1522 +	}
  1.1523 +	
  1.1524 +	CHECKPOINT();
  1.1525 +	/* tail: the remaining 0-3 pixels, one at a time */
  1.1526 +	while (w)
  1.1527 +	{
  1.1528 +	    ullong m = *mask;
  1.1529 +	    
  1.1530 +	    if (m)
  1.1531 +	    {
  1.1532 +		ullong d = *dst;
  1.1533 +		__m64 vd = (__m64)d;
  1.1534 +		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
  1.1535 +		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
  1.1536 +	    }
  1.1537 +	    
  1.1538 +	    w--;
  1.1539 +	    mask++;
  1.1540 +	    dst++;
  1.1541 +	}
  1.1542 +    }
  1.1543 +    
  1.1544 +    _mm_empty();
  1.1545 +}
  1.1546 +/* Apply over_rev_non_pre() from a 32-bit source area onto a 16-bit (565) destination area */
  1.1547 +void
  1.1548 +fbCompositeSrc_8888RevNPx0565mmx (CARD8      op,
  1.1549 +				  PicturePtr pSrc,
  1.1550 +				  PicturePtr pMask,
  1.1551 +				  PicturePtr pDst,
  1.1552 +				  INT16      xSrc,
  1.1553 +				  INT16      ySrc,
  1.1554 +				  INT16      xMask,
  1.1555 +				  INT16      yMask,
  1.1556 +				  INT16      xDst,
  1.1557 +				  INT16      yDst,
  1.1558 +				  CARD16     width,
  1.1559 +				  CARD16     height)
  1.1560 +{
  1.1561 +    CARD16	*dstLine, *dst;
  1.1562 +    CARD32	*srcLine, *src;
  1.1563 +    FbStride	dstStride, srcStride;
  1.1564 +    CARD16	w;
  1.1565 +    
  1.1566 +    CHECKPOINT();
  1.1567 +    
  1.1568 +    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1.1569 +    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1.1570 +    /* this routine expects source and mask to share one drawable */
  1.1571 +    assert (pSrc->pDrawable == pMask->pDrawable);
  1.1572 +    
  1.1573 +    while (height--)
  1.1574 +    {
  1.1575 +	dst = dstLine;
  1.1576 +	dstLine += dstStride;
  1.1577 +	src = srcLine;
  1.1578 +	srcLine += srcStride;
  1.1579 +	w = width;
  1.1580 +	
  1.1581 +	CHECKPOINT();
  1.1582 +	/* head: single pixels until dst is 8-byte aligned */
  1.1583 +	while (w && (unsigned long)dst & 7)
  1.1584 +	{
  1.1585 +	    __m64 vsrc = load8888 (*src);
  1.1586 +	    ullong d = *dst;
  1.1587 +	    __m64 vdest = expand565 ((__m64)d, 0);
  1.1588 +	    
  1.1589 +	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
  1.1590 +	    
  1.1591 +	    *dst = (ullong)vdest;
  1.1592 +	    
  1.1593 +	    w--;
  1.1594 +	    dst++;
  1.1595 +	    src++;
  1.1596 +	}
  1.1597 +	
  1.1598 +	CHECKPOINT();
  1.1599 +	/* body: four source pixels produce one 64-bit word of destination */
  1.1600 +	while (w >= 4)
  1.1601 +	{
  1.1602 +	    CARD32 s0, s1, s2, s3;
  1.1603 +	    unsigned char a0, a1, a2, a3;
  1.1604 +	    
  1.1605 +	    s0 = *src;
  1.1606 +	    s1 = *(src + 1);
  1.1607 +	    s2 = *(src + 2);
  1.1608 +	    s3 = *(src + 3);
  1.1609 +	    /* a0..a3: bits 24-31 of each source pixel */
  1.1610 +	    a0 = (s0 >> 24);
  1.1611 +	    a1 = (s1 >> 24);
  1.1612 +	    a2 = (s2 >> 24);
  1.1613 +	    a3 = (s3 >> 24);
  1.1614 +	    /* all four are 0xFF: build the word from invert_colors() of the sources without reading dst */
  1.1615 +	    if ((a0 & a1 & a2 & a3) == 0xFF)
  1.1616 +	    {
  1.1617 +		__m64 vdest;
  1.1618 +		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
  1.1619 +		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
  1.1620 +		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
  1.1621 +		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
  1.1622 +		
  1.1623 +		*(__m64 *)dst = vdest;
  1.1624 +	    }
  1.1625 +	    else if (a0 | a1 | a2 | a3)
  1.1626 +	    {
  1.1627 +		__m64 vdest = *(__m64 *)dst;
  1.1628 +		
  1.1629 +		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
  1.1630 +	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
  1.1631 +		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
  1.1632 +		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
  1.1633 +		
  1.1634 +		*(__m64 *)dst = vdest;
  1.1635 +	    }
  1.1636 +	    /* when all four are zero, dst is left untouched */
  1.1637 +	    w -= 4;
  1.1638 +	    dst += 4;
  1.1639 +	    src += 4;
  1.1640 +	}
  1.1641 +	
  1.1642 +	CHECKPOINT();
  1.1643 +	/* tail: the remaining 0-3 pixels, one at a time */
  1.1644 +	while (w)
  1.1645 +	{
  1.1646 +	    __m64 vsrc = load8888 (*src);
  1.1647 +	    ullong d = *dst;
  1.1648 +	    __m64 vdest = expand565 ((__m64)d, 0);
  1.1649 +	    
  1.1650 +	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
  1.1651 +	    
  1.1652 +	    *dst = (ullong)vdest;
  1.1653 +	    
  1.1654 +	    w--;
  1.1655 +	    dst++;
  1.1656 +	    src++;
  1.1657 +	}
  1.1658 +    }
  1.1659 +    
  1.1660 +    _mm_empty();
  1.1661 +}
  1.1662 +
  1.1663 +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
  1.1664 +/* Apply over_rev_non_pre() from a 32-bit source area onto a 32-bit destination area */
  1.1665 +void
  1.1666 +fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
  1.1667 +				  PicturePtr pSrc,
  1.1668 +				  PicturePtr pMask,
  1.1669 +				  PicturePtr pDst,
  1.1670 +				  INT16      xSrc,
  1.1671 +				  INT16      ySrc,
  1.1672 +				  INT16      xMask,
  1.1673 +				  INT16      yMask,
  1.1674 +				  INT16      xDst,
  1.1675 +				  INT16      yDst,
  1.1676 +				  CARD16     width,
  1.1677 +				  CARD16     height)
  1.1678 +{
  1.1679 +    CARD32	*dstLine, *dst;
  1.1680 +    CARD32	*srcLine, *src;
  1.1681 +    FbStride	dstStride, srcStride;
  1.1682 +    CARD16	w;
  1.1683 +    
  1.1684 +    CHECKPOINT();
  1.1685 +    
  1.1686 +    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
  1.1687 +    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
  1.1688 +    /* this routine expects source and mask to share one drawable */
  1.1689 +    assert (pSrc->pDrawable == pMask->pDrawable);
  1.1690 +    
  1.1691 +    while (height--)
  1.1692 +    {
  1.1693 +	dst = dstLine;
  1.1694 +	dstLine += dstStride;
  1.1695 +	src = srcLine;
  1.1696 +	srcLine += srcStride;
  1.1697 +	w = width;
  1.1698 +	/* head: single pixels until dst is 8-byte aligned */
  1.1699 +	while (w && (unsigned long)dst & 7)
  1.1700 +	{
  1.1701 +	    __m64 s = load8888 (*src);
  1.1702 +	    __m64 d = load8888 (*dst);
  1.1703 +	    
  1.1704 +	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
  1.1705 +	    
  1.1706 +	    w--;
  1.1707 +	    dst++;
  1.1708 +	    src++;
  1.1709 +	}
  1.1710 +	/* body: two pixels per 64-bit word */
  1.1711 +	while (w >= 2)
  1.1712 +	{
  1.1713 +	    ullong s0, s1;
  1.1714 +	    unsigned char a0, a1;
  1.1715 +	    __m64 d0, d1;
  1.1716 +	    
  1.1717 +	    s0 = *src;
  1.1718 +	    s1 = *(src + 1);
  1.1719 +	    /* a0, a1: bits 24-31 of each source pixel */
  1.1720 +	    a0 = (s0 >> 24);
  1.1721 +	    a1 = (s1 >> 24);
  1.1722 +	    /* both are 0xFF: store invert_colors() of the sources without reading dst */
  1.1723 +	    if ((a0 & a1) == 0xFF)
  1.1724 +	    {
  1.1725 +		d0 = invert_colors(load8888(s0));
  1.1726 +		d1 = invert_colors(load8888(s1));
  1.1727 +		
  1.1728 +		*(__m64 *)dst = pack8888 (d0, d1);
  1.1729 +	    }
  1.1730 +	    else if (a0 | a1)
  1.1731 +	    {
  1.1732 +		__m64 vdest = *(__m64 *)dst;
  1.1733 +		
  1.1734 +		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
  1.1735 +		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
  1.1736 +		
  1.1737 +		*(__m64 *)dst = pack8888 (d0, d1);
  1.1738 +	    }
  1.1739 +	    /* when both are zero, dst is left untouched */
  1.1740 +	    w -= 2;
  1.1741 +	    dst += 2;
  1.1742 +	    src += 2;
  1.1743 +	}
  1.1744 +	/* tail: at most one pixel left */
  1.1745 +	while (w)
  1.1746 +	{
  1.1747 +	    __m64 s = load8888 (*src);
  1.1748 +	    __m64 d = load8888 (*dst);
  1.1749 +	    
  1.1750 +	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
  1.1751 +	    
  1.1752 +	    w--;
  1.1753 +	    dst++;
  1.1754 +	    src++;
  1.1755 +	}
  1.1756 +    }
  1.1757 +    
  1.1758 +    _mm_empty();
  1.1759 +}
  1.1760 +/* Apply in_over() with one solid colour and a 32-bit-per-pixel mask onto a 16-bit (565) destination area */
  1.1761 +void
  1.1762 +fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
  1.1763 +				      PicturePtr pSrc,
  1.1764 +				      PicturePtr pMask,
  1.1765 +				      PicturePtr pDst,
  1.1766 +				      INT16      xSrc,
  1.1767 +				      INT16      ySrc,
  1.1768 +				      INT16      xMask,
  1.1769 +				      INT16      yMask,
  1.1770 +				      INT16      xDst,
  1.1771 +				      INT16      yDst,
  1.1772 +				      CARD16     width,
  1.1773 +				      CARD16     height)
  1.1774 +{
  1.1775 +    CARD32	src, srca;
  1.1776 +    CARD16	*dstLine;
  1.1777 +    CARD32	*maskLine;
  1.1778 +    FbStride	dstStride, maskStride;
  1.1779 +    __m64  vsrc, vsrca;
  1.1780 +    
  1.1781 +    CHECKPOINT();
  1.1782 +    
  1.1783 +    fbComposeGetSolid(pSrc, src, pDst->format);
  1.1784 +    /* srca is bits 24-31 of the solid colour; nothing to do when it is zero */
  1.1785 +    srca = src >> 24;
  1.1786 +    if (srca == 0)
  1.1787 +	return;
  1.1788 +    
  1.1789 +    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
  1.1790 +    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
  1.1791 +    
  1.1792 +    vsrc = load8888 (src);
  1.1793 +    vsrca = expand_alpha (vsrc);
  1.1794 +    
  1.1795 +    while (height--)
  1.1796 +    {
  1.1797 +	int twidth = width;
  1.1798 +	CARD32 *p = (CARD32 *)maskLine;
  1.1799 +	CARD16 *q = (CARD16 *)dstLine;
  1.1800 +	/* head: single pixels until q is 8-byte aligned; a zero mask word leaves the pixel untouched */
  1.1801 +	while (twidth && ((unsigned long)q & 7))
  1.1802 +	{
  1.1803 +	    CARD32 m = *(CARD32 *)p;
  1.1804 +	    
  1.1805 +	    if (m)
  1.1806 +	    {
  1.1807 +		ullong d = *q;
  1.1808 +		__m64 vdest = expand565 ((__m64)d, 0);
  1.1809 +		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
  1.1810 +		*q = (ullong)vdest;
  1.1811 +	    }
  1.1812 +	    
  1.1813 +	    twidth--;
  1.1814 +	    p++;
  1.1815 +	    q++;
  1.1816 +	}
  1.1817 +	/* body: four 16-bit pixels per 64-bit word, skipped when all four mask words are zero */
  1.1818 +	while (twidth >= 4)
  1.1819 +	{
  1.1820 +	    CARD32 m0, m1, m2, m3;
  1.1821 +	    
  1.1822 +	    m0 = *p;
  1.1823 +	    m1 = *(p + 1);
  1.1824 +	    m2 = *(p + 2);
  1.1825 +	    m3 = *(p + 3);
  1.1826 +	    
  1.1827 +	    if ((m0 | m1 | m2 | m3))
  1.1828 +	    {
  1.1829 +		__m64 vdest = *(__m64 *)q;
  1.1830 +		
  1.1831 +		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
  1.1832 +		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
  1.1833 +		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
  1.1834 +		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
  1.1835 +		
  1.1836 +		*(__m64 *)q = vdest;
  1.1837 +	    }
  1.1838 +	    twidth -= 4;
  1.1839 +	    p += 4;
  1.1840 +	    q += 4;
  1.1841 +	}
  1.1842 +	/* tail: the remaining 0-3 pixels, one at a time */
  1.1843 +	while (twidth)
  1.1844 +	{
  1.1845 +	    CARD32 m;
  1.1846 +	    
  1.1847 +	    m = *(CARD32 *)p;
  1.1848 +	    if (m)
  1.1849 +	    {
  1.1850 +		ullong d = *q;
  1.1851 +		__m64 vdest = expand565((__m64)d, 0);
  1.1852 +		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
  1.1853 +		*q = (ullong)vdest;
  1.1854 +	    }
  1.1855 +	    
  1.1856 +	    twidth--;
  1.1857 +	    p++;
  1.1858 +	    q++;
  1.1859 +	}
  1.1860 +	/* advance both row pointers to the next scanline */
  1.1861 +	maskLine += maskStride;
  1.1862 +	dstLine += dstStride;
  1.1863 +    }
  1.1864 +    
  1.1865 +    _mm_empty ();
  1.1866 +}
  1.1867 +#endif
  1.1868 +/* Saturating byte-wise add of src into dst for w bytes: dst[i] = min (dst[i] + src[i], 255).  A count <= 0 does nothing. */
  1.1869 +static void
  1.1870 +fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
  1.1871 +{
  1.1872 +    int s;
  1.1873 +    int d;
  1.1874 +    int t;
  1.1875 +    /* head: single bytes until dst is 8-byte aligned; testing w > 0 keeps a negative count from looping forever */
  1.1876 +    while (w > 0 && ((unsigned long)dst & 7))
  1.1877 +    {
  1.1878 +        s = *src;
  1.1879 +        d = *dst;
  1.1880 +        t = d + s;
  1.1881 +        s = t | (0 - (t >> 8)); /* t >> 8 is 1 when the sum exceeds 255; OR-ing with -1 then stores 0xff */
  1.1882 +        *dst = s;
  1.1883 +        
  1.1884 +        dst++;
  1.1885 +        src++;
  1.1886 +        w--;
  1.1887 +    }
  1.1888 +    /* body: eight bytes per unsigned saturating 64-bit add */
  1.1889 +    while (w >= 8)
  1.1890 +    {
  1.1891 +        *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
  1.1892 +        dst += 8;
  1.1893 +        src += 8;
  1.1894 +        w -= 8;
  1.1895 +    }
  1.1896 +    /* tail: the remaining 0-7 bytes with the same scalar saturation as the head */
  1.1897 +    while (w > 0)
  1.1898 +    {
  1.1899 +        s = *src;
  1.1900 +        d = *dst;
  1.1901 +        t = d + s;
  1.1902 +        s = t | (0 - (t >> 8));
  1.1903 +        *dst = s;
  1.1904 +        
  1.1905 +        dst++;
  1.1906 +        src++;
  1.1907 +        w--;
  1.1908 +    }
  1.1909 +    /* empty the MMX state (EMMS) before returning */
  1.1910 +    _mm_empty();
  1.1911 +}
  1.1912 +OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX);
  1.1913 +
  1.1914 +static void
  1.1915 +fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
  1.1916 +{
  1.1917 +    while (w && (unsigned long)dst & 7)
  1.1918 +    {
  1.1919 +        *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
  1.1920 +                                             _mm_cvtsi32_si64(*dst)));
  1.1921 +        dst++;
  1.1922 +        src++;
  1.1923 +        w--;
  1.1924 +    }
  1.1925 +    
  1.1926 +    while (w >= 2)
  1.1927 +    {
  1.1928 +        *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
  1.1929 +        dst += 2;
  1.1930 +        src += 2;
  1.1931 +        w -= 2;
  1.1932 +    }
  1.1933 +    
  1.1934 +    if (w)
  1.1935 +    {
  1.1936 +        *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
  1.1937 +                                             _mm_cvtsi32_si64(*dst)));
  1.1938 +        
  1.1939 +    }
  1.1940 +    
  1.1941 +    _mm_empty();
  1.1942 +}
  1.1943 +OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
  1.1944 +
  1.1945 +#if 0
  1.1946 +#define GetStart(drw,x,y,type,stride,line,bpp) {\
  1.1947 +    FbBits	*__bits__;									\
  1.1948 +    FbStride	__stride__;									\
  1.1949 +    int		__xoff__,__yoff__;								\
  1.1950 +												\
  1.1951 +    fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__);				\
  1.1952 +    (stride) = __stride__ * sizeof (FbBits) / sizeof (type);					\
  1.1953 +    (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__);		\
  1.1954 +}
  1.1955 +
  1.1956 +Bool
  1.1957 +fbSolidFillmmx (DrawablePtr	pDraw,
  1.1958 +		int		x,
  1.1959 +		int		y,
  1.1960 +		int		width,
  1.1961 +		int		height,
  1.1962 +		FbBits		xor)
  1.1963 +{ 
  1.1964 +    FbStride	stride;
  1.1965 +    int		bpp;
  1.1966 +    ullong	fill;
  1.1967 +    __m64	vfill;
  1.1968 +    CARD32	byte_width;
  1.1969 +    CARD8	*byte_line;
  1.1970 +    FbBits      *bits;
  1.1971 +    int		xoff, yoff;
  1.1972 +    
  1.1973 +    CHECKPOINT();
  1.1974 +    
  1.1975 +    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
  1.1976 +    
  1.1977 +    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
  1.1978 +	return FALSE;
  1.1979 +    
  1.1980 +    if (bpp != 16 && bpp != 32)
  1.1981 +	return FALSE;
  1.1982 +    
  1.1983 +    if (bpp == 16)
  1.1984 +    {
  1.1985 +	stride = stride * sizeof (FbBits) / 2;
  1.1986 +	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
  1.1987 +	byte_width = 2 * width;
  1.1988 +	stride *= 2;
  1.1989 +    }
  1.1990 +    else
  1.1991 +    {
  1.1992 +	stride = stride * sizeof (FbBits) / 4;
  1.1993 +	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
  1.1994 +	byte_width = 4 * width;
  1.1995 +	stride *= 4;
  1.1996 +    }
  1.1997 +    
  1.1998 +    fill = ((ullong)xor << 32) | xor;
  1.1999 +    vfill = (__m64)fill;
  1.2000 +    
  1.2001 +    while (height--)
  1.2002 +    {
  1.2003 +	int w;
  1.2004 +	CARD8 *d = byte_line;
  1.2005 +	byte_line += stride;
  1.2006 +	w = byte_width;
  1.2007 +	
  1.2008 +	while (w >= 2 && ((unsigned long)d & 3))
  1.2009 +	{
  1.2010 +	    *(CARD16 *)d = xor;
  1.2011 +	    w -= 2;
  1.2012 +	    d += 2;
  1.2013 +	}
  1.2014 +	
  1.2015 +	while (w >= 4 && ((unsigned long)d & 7))
  1.2016 +	{
  1.2017 +	    *(CARD32 *)d = xor;
  1.2018 +	    
  1.2019 +	    w -= 4;
  1.2020 +	    d += 4;
  1.2021 +	}
  1.2022 +	
  1.2023 +	while (w >= 64)
  1.2024 +	{
  1.2025 +	    *(__m64*) (d +  0) = vfill;
  1.2026 +	    *(__m64*) (d +  8) = vfill;
  1.2027 +	    *(__m64*) (d + 16) = vfill;
  1.2028 +	    *(__m64*) (d + 24) = vfill;
  1.2029 +	    *(__m64*) (d + 32) = vfill;
  1.2030 +	    *(__m64*) (d + 40) = vfill;
  1.2031 +	    *(__m64*) (d + 48) = vfill;
  1.2032 +	    *(__m64*) (d + 56) = vfill;
  1.2033 +	    
  1.2034 +	    w -= 64;
  1.2035 +	    d += 64;
  1.2036 +	}
  1.2037 +	while (w >= 4)
  1.2038 +	{
  1.2039 +	    *(CARD32 *)d = xor;
  1.2040 +	    
  1.2041 +	    w -= 4;
  1.2042 +	    d += 4;
  1.2043 +	}
  1.2044 +	if (w >= 2)
  1.2045 +	{
  1.2046 +	    *(CARD16 *)d = xor;
  1.2047 +	    w -= 2;
  1.2048 +	    d += 2;
  1.2049 +	}
  1.2050 +    }
  1.2051 +    
  1.2052 +    _mm_empty();
  1.2053 +    return TRUE;
  1.2054 +}
  1.2055 +
  1.2056 +Bool
  1.2057 +fbCopyAreammx (DrawablePtr	pSrc,
  1.2058 +	       DrawablePtr	pDst,
  1.2059 +	       int		src_x,
  1.2060 +	       int		src_y,
  1.2061 +	       int		dst_x,
  1.2062 +	       int		dst_y,
  1.2063 +	       int		width,
  1.2064 +	       int		height)
  1.2065 +{
  1.2066 +    FbBits *	src_bits;
  1.2067 +    FbStride	src_stride;
  1.2068 +    int		src_bpp;
  1.2069 +    int		src_xoff;
  1.2070 +    int		src_yoff;
  1.2071 +
  1.2072 +    FbBits *	dst_bits;
  1.2073 +    FbStride	dst_stride;
  1.2074 +    int		dst_bpp;
  1.2075 +    int		dst_xoff;
  1.2076 +    int		dst_yoff;
  1.2077 +
  1.2078 +    CARD8 *	src_bytes;
  1.2079 +    CARD8 *	dst_bytes;
  1.2080 +    int		byte_width;
  1.2081 +    
  1.2082 +    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
  1.2083 +    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
  1.2084 +
  1.2085 +    if (src_bpp != 16 && src_bpp != 32)
  1.2086 +	return FALSE;
  1.2087 +
  1.2088 +    if (dst_bpp != 16 && dst_bpp != 32)
  1.2089 +	return FALSE;
  1.2090 +
  1.2091 +    if (src_bpp != dst_bpp)
  1.2092 +    {
  1.2093 +	return FALSE;
  1.2094 +    }
  1.2095 +    
  1.2096 +    if (src_bpp == 16)
  1.2097 +    {
  1.2098 +	src_stride = src_stride * sizeof (FbBits) / 2;
  1.2099 +	dst_stride = dst_stride * sizeof (FbBits) / 2;
  1.2100 +	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
  1.2101 +	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
  1.2102 +	byte_width = 2 * width;
  1.2103 +	src_stride *= 2;
  1.2104 +	dst_stride *= 2;
  1.2105 +    }
  1.2106 +    else
  1.2107 +    {
  1.2108 +	src_stride = src_stride * sizeof (FbBits) / 4;
  1.2109 +	dst_stride = dst_stride * sizeof (FbBits) / 4;
  1.2110 +	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
  1.2111 +	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
  1.2112 +	byte_width = 4 * width;
  1.2113 +	src_stride *= 4;
  1.2114 +	dst_stride *= 4;
  1.2115 +    }
  1.2116 +
  1.2117 +    while (height--)
  1.2118 +    {
  1.2119 +	int w;
  1.2120 +	CARD8 *s = src_bytes;
  1.2121 +	CARD8 *d = dst_bytes;
  1.2122 +	src_bytes += src_stride;
  1.2123 +	dst_bytes += dst_stride;
  1.2124 +	w = byte_width;
  1.2125 +	
  1.2126 +	while (w >= 2 && ((unsigned long)d & 3))
  1.2127 +	{
  1.2128 +	    *(CARD16 *)d = *(CARD16 *)s;
  1.2129 +	    w -= 2;
  1.2130 +	    s += 2;
  1.2131 +	    d += 2;
  1.2132 +	}
  1.2133 +	
  1.2134 +	while (w >= 4 && ((unsigned long)d & 7))
  1.2135 +	{
  1.2136 +	    *(CARD32 *)d = *(CARD32 *)s;
  1.2137 +	    
  1.2138 +	    w -= 4;
  1.2139 +	    s += 4;
  1.2140 +	    d += 4;
  1.2141 +	}
  1.2142 +	
  1.2143 +	while (w >= 64)
  1.2144 +	{
  1.2145 +	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
  1.2146 +	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
  1.2147 +	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
  1.2148 +	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
  1.2149 +	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
  1.2150 +	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
  1.2151 +	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
  1.2152 +	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
  1.2153 +	    w -= 64;
  1.2154 +	    s += 64;
  1.2155 +	    d += 64;
  1.2156 +	}
  1.2157 +	while (w >= 4)
  1.2158 +	{
  1.2159 +	    *(CARD32 *)d = *(CARD32 *)s;
  1.2160 +
  1.2161 +	    w -= 4;
  1.2162 +	    s += 4;
  1.2163 +	    d += 4;
  1.2164 +	}
  1.2165 +	if (w >= 2)
  1.2166 +	{
  1.2167 +	    *(CARD16 *)d = *(CARD16 *)s;
  1.2168 +	    w -= 2;
  1.2169 +	    s += 2;
  1.2170 +	    d += 2;
  1.2171 +	}
  1.2172 +    }
  1.2173 +    
  1.2174 +    _mm_empty();
  1.2175 +    return TRUE;
  1.2176 +}
  1.2177 +
  1.2178 +void
  1.2179 +fbCompositeCopyAreammx (CARD8		op,
  1.2180 +			PicturePtr	pSrc,
  1.2181 +			PicturePtr	pMask,
  1.2182 +			PicturePtr	pDst,
  1.2183 +			INT16		xSrc,
  1.2184 +			INT16		ySrc,
  1.2185 +			INT16		xMask,
  1.2186 +			INT16		yMask,
  1.2187 +			INT16		xDst,
  1.2188 +			INT16		yDst,
  1.2189 +			CARD16		width,
  1.2190 +			CARD16		height)
  1.2191 +{
  1.2192 +    fbCopyAreammx (pSrc->pDrawable,
  1.2193 +		   pDst->pDrawable,
  1.2194 +		   xSrc, ySrc,
  1.2195 +		   xDst, yDst,
  1.2196 +		   width, height);
  1.2197 +}
  1.2198 +
  1.2199 +#if !defined(__amd64__) && !defined(__x86_64__)
  1.2200 +
  1.2201 +enum CPUFeatures {
  1.2202 +    NoFeatures = 0,
  1.2203 +    MMX = 0x1,
  1.2204 +    MMX_Extensions = 0x2, 
  1.2205 +    SSE = 0x6,
  1.2206 +    SSE2 = 0x8,
  1.2207 +    CMOV = 0x10
  1.2208 +};
  1.2209 +
  1.2210 +static unsigned int detectCPUFeatures(void) {
  1.2211 +    unsigned int result;
  1.2212 +    char vendor[13];
  1.2213 +    vendor[0] = 0;
  1.2214 +    vendor[12] = 0;
  1.2215 +    /* see p. 118 of amd64 instruction set manual Vol3 */
  1.2216 +    __asm__ ("push %%ebx\n"
  1.2217 +             "pushf\n"
  1.2218 +             "pop %%eax\n"
  1.2219 +             "mov %%eax, %%ebx\n"
  1.2220 +             "xor $0x00200000, %%eax\n"
  1.2221 +             "push %%eax\n"
  1.2222 +             "popf\n"
  1.2223 +             "pushf\n"
  1.2224 +             "pop %%eax\n"
  1.2225 +             "mov $0x0, %%edx\n"
  1.2226 +             "xor %%ebx, %%eax\n"
  1.2227 +             "jz skip\n"
  1.2228 +
  1.2229 +             "mov $0x00000000, %%eax\n"
  1.2230 +             "cpuid\n"
  1.2231 +             "mov %%ebx, %1\n"
  1.2232 +             "mov %%edx, %2\n"
  1.2233 +             "mov %%ecx, %3\n"
  1.2234 +             "mov $0x00000001, %%eax\n"
  1.2235 +             "cpuid\n"
  1.2236 +             "skip:\n"
  1.2237 +             "pop %%ebx\n"
  1.2238 +             "mov %%edx, %0\n"
  1.2239 +             : "=r" (result), 
  1.2240 +               "=m" (vendor[0]), 
  1.2241 +               "=m" (vendor[4]), 
  1.2242 +               "=m" (vendor[8])
  1.2243 +             :
  1.2244 +             : "%eax", "%ecx", "%edx"
  1.2245 +        );
  1.2246 +
  1.2247 +    unsigned int features = 0;
  1.2248 +    if (result) {
  1.2249 +        /* result now contains the standard feature bits */
  1.2250 +        if (result & (1 << 15))
  1.2251 +            features |= CMOV;
  1.2252 +        if (result & (1 << 23))
  1.2253 +            features |= MMX;
  1.2254 +        if (result & (1 << 25))
  1.2255 +            features |= SSE;
  1.2256 +        if (result & (1 << 26))
  1.2257 +            features |= SSE2;
  1.2258 +        if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
  1.2259 +            /* check for AMD MMX extensions */
  1.2260 +
  1.2261 +            unsigned int result;            
  1.2262 +            __asm__("push %%ebx\n"
  1.2263 +                    "mov $0x80000000, %%eax\n"
  1.2264 +                    "cpuid\n"
  1.2265 +                    "xor %%edx, %%edx\n"
  1.2266 +                    "cmp $0x1, %%eax\n"
  1.2267 +                    "jge skip2\n"
  1.2268 +                    "mov $0x80000001, %%eax\n"
  1.2269 +                    "cpuid\n"
  1.2270 +                    "skip2:\n"
  1.2271 +                    "mov %%edx, %0\n"
  1.2272 +                    "pop %%ebx\n"
  1.2273 +                    : "=r" (result)
  1.2274 +                    :
  1.2275 +                    : "%eax", "%ecx", "%edx"
  1.2276 +                );
  1.2277 +            if (result & (1<<22))
  1.2278 +                features |= MMX_Extensions;
  1.2279 +        }
  1.2280 +    }
  1.2281 +    return features;
  1.2282 +}
  1.2283 +
  1.2284 +Bool
  1.2285 +fbHaveMMX (void)
  1.2286 +{
  1.2287 +    static Bool initialized = FALSE;
  1.2288 +    static Bool mmx_present;
  1.2289 +    
  1.2290 +    if (!initialized)
  1.2291 +    {
  1.2292 +        unsigned int features = detectCPUFeatures();
  1.2293 +	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
  1.2294 +        initialized = TRUE;
  1.2295 +    }
  1.2296 +    
  1.2297 +    return mmx_present;
  1.2298 +}
  1.2299 +#endif /* __amd64__ */
  1.2300 +
  1.2301 +
  1.2302 +#endif
  1.2303 +
  1.2304 +
  1.2305 +#ifdef	__SYMBIAN32__
  1.2306 + 
  1.2307 +OilFunctionImpl* __oil_function_impl_mmxCombineOverU, composite_over_argb() {
  1.2308 +		return &_oil_function_impl_mmxCombineOverU, composite_over_argb;
  1.2309 +}
  1.2310 +#endif
  1.2311 +
  1.2312 +#ifdef	__SYMBIAN32__
  1.2313 + 
  1.2314 +OilFunctionImpl* __oil_function_impl_mmxCombineAddU, composite_add_argb() {
  1.2315 +		return &_oil_function_impl_mmxCombineAddU, composite_add_argb;
  1.2316 +}
  1.2317 +#endif
  1.2318 +
  1.2319 +#ifdef	__SYMBIAN32__
  1.2320 + 
  1.2321 +OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src() {
  1.2322 +		return &_oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src;
  1.2323 +}
  1.2324 +#endif
  1.2325 +
  1.2326 +#ifdef	__SYMBIAN32__
  1.2327 + 
  1.2328 +OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8() {
  1.2329 +		return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8;
  1.2330 +}
  1.2331 +#endif
  1.2332 +
  1.2333 +#ifdef	__SYMBIAN32__
  1.2334 + 
  1.2335 +OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb() {
  1.2336 +		return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb;
  1.2337 +}
  1.2338 +#endif
  1.2339 +