1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/fb/fbmmx.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,2336 @@
1.4 +/*
1.5 + * Copyright © 2004 Red Hat, Inc.
1.6 + * Copyright © 2004 Nicholas Miell
1.7 + * Copyright © 2005 Trolltech AS
1.8 + *
1.9 + * Permission to use, copy, modify, distribute, and sell this software and its
1.10 + * documentation for any purpose is hereby granted without fee, provided that
1.11 + * the above copyright notice appear in all copies and that both that
1.12 + * copyright notice and this permission notice appear in supporting
1.13 + * documentation, and that the name of Red Hat not be used in advertising or
1.14 + * publicity pertaining to distribution of the software without specific,
1.15 + * written prior permission. Red Hat makes no representations about the
1.16 + * suitability of this software for any purpose. It is provided "as is"
1.17 + * without express or implied warranty.
1.18 + *
1.19 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
1.20 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
1.21 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
1.22 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1.23 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
1.24 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
1.25 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
1.26 + * SOFTWARE.
1.27 + *
1.28 + * Author: Søren Sandmann (sandmann@redhat.com)
1.29 + * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
1.30 + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
1.31 + *
1.32 + * Based on work by Owen Taylor
1.33 + */
1.34 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.35 +
1.36 +#ifdef HAVE_CONFIG_H
1.37 +#include "config.h"
1.38 +#endif
1.39 +
1.40 +#include <liboil/liboil.h>
1.41 +#include <liboil/liboilfunction.h>
1.42 +
1.43 +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
1.44 +
1.45 +typedef uint32_t CARD32;
1.46 +typedef uint16_t CARD16;
1.47 +typedef int16_t INT16;
1.48 +typedef uint8_t CARD8;
1.49 +typedef uint64_t ullong;
1.50 +typedef CARD32* PicturePtr;
1.51 +typedef CARD32* FbBits;
1.52 +typedef int FbStride;
1.53 +
1.54 +
1.55 +#include "fbmmx.h"
1.56 +#include "fbpict.h"
1.57 +
1.58 +#define CHECKPOINT()
1.59 +
1.60 +OIL_DECLARE_CLASS (composite_in_argb);
1.61 +OIL_DECLARE_CLASS (composite_in_argb_const_src);
1.62 +OIL_DECLARE_CLASS (composite_in_argb_const_mask);
1.63 +OIL_DECLARE_CLASS (composite_over_argb);
1.64 +OIL_DECLARE_CLASS (composite_over_argb_const_src);
1.65 +OIL_DECLARE_CLASS (composite_add_argb);
1.66 +OIL_DECLARE_CLASS (composite_add_argb_const_src);
1.67 +OIL_DECLARE_CLASS (composite_in_over_argb);
1.68 +OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
1.69 +OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
1.70 +OIL_DECLARE_CLASS (composite_over_u8);
1.71 +OIL_DECLARE_CLASS (composite_add_u8);
1.72 +
1.73 +
1.74 +/* --------------- MMX code paths for fbcompose.c --------------------- */
1.75 +
1.76 +#if 0
1.77 +static void
1.78 +mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
1.79 +{ /* Disabled (#if 0). Presumably writes dest = src scaled by the mask's alpha; the Mmx* macros are not visible in this chunk. */
1.80 + const __m64 mmx_0 = _mm_setzero_si64();
1.81 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.82 + /* NOTE(review): `end` is a uint32_t* but is initialised from (and compared with) the uint8_t* `mask`; confirm the intended mask width before enabling. */
1.83 + const uint32_t *end = mask + width;
1.84 + while (mask < end) {
1.85 + __m64 a = MmxTo(*mask);
1.86 + __m64 s = MmxTo(*src);
1.87 + a = MmxAlpha(a);
1.88 + MmxMul(s, a);
1.89 + *dest = MmxFrom(s);
1.90 + ++src;
1.91 + ++dest;
1.92 + ++mask;
1.93 + }
1.94 + _mm_empty();
1.95 +}
1.96 +#endif
1.97 +
1.98 +#ifdef ENABLE_BROKEN_IMPLS
1.99 +static void
1.100 +mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
1.101 +{ /* Presumably OVER: dest = src + dest*(1 - src.alpha), assuming MmxMulAdd(x, a, y) computes x = x*a + y (macros not visible here). */
1.102 + const __m64 mmx_0 = _mm_setzero_si64();
1.103 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.104 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.105 + /* The three constants above are never referenced directly; presumably the Mmx* macros pick them up by name. */
1.106 + const uint32_t *end = dest + width;
1.107 +
1.108 + while (dest < end) {
1.109 + __m64 x, y, a;
1.110 + x = MmxTo(*src);
1.111 + y = MmxTo(*dest);
1.112 + a = MmxAlpha(x);
1.113 + a = MmxNegate(a);
1.114 + MmxMulAdd(y, a, x);
1.115 + *dest = MmxFrom(y);
1.116 + ++dest;
1.117 + ++src;
1.118 + }
1.119 + _mm_empty();
1.120 +}
1.121 +OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX); /* only compiled when ENABLE_BROKEN_IMPLS is defined */
1.122 +#endif
1.123 +
1.124 +#if 0
1.125 +static FASTCALL void
1.126 +mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
1.127 +{
1.128 + const __m64 mmx_0 = _mm_setzero_si64();
1.129 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.130 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.131 + /* Disabled (#if 0). Presumably dest = dest + src*(1 - dest.alpha); Mmx* macro semantics assumed, macros not visible here. */
1.132 + const CARD32 *end = dest + width;
1.133 +
1.134 + while (dest < end) {
1.135 + __m64 x, y, a;
1.136 + x = MmxTo(*dest);
1.137 + y = MmxTo(*src);
1.138 + a = MmxAlpha(x);
1.139 + a = MmxNegate(a);
1.140 + MmxMulAdd(y, a, x);
1.141 + *dest = MmxFrom(y);
1.142 + ++dest;
1.143 + ++src;
1.144 + }
1.145 + _mm_empty();
1.146 +}
1.147 +#endif
1.148 +
1.149 +#if 0
1.150 +static void
1.151 +mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
1.152 +{
1.153 + const __m64 mmx_0 = _mm_setzero_si64();
1.154 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.155 + /* Disabled (#if 0). Presumably dest = src * dest.alpha; Mmx* macro semantics assumed, macros not visible here. */
1.156 + const CARD32 *end = dest + width;
1.157 +
1.158 + while (dest < end) {
1.159 + __m64 x, a;
1.160 + x = MmxTo(*src);
1.161 + a = MmxTo(*dest);
1.162 + a = MmxAlpha(a);
1.163 + MmxMul(x, a);
1.164 + *dest = MmxFrom(x);
1.165 + ++dest;
1.166 + ++src;
1.167 + }
1.168 + _mm_empty();
1.169 +}
1.170 +#endif
1.171 +
1.172 +#if 0
1.173 +static FASTCALL void
1.174 +mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
1.175 +{
1.176 + const __m64 mmx_0 = _mm_setzero_si64();
1.177 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.178 + /* Disabled (#if 0). Presumably dest = dest * src.alpha; Mmx* macro semantics assumed, macros not visible here. */
1.179 + const CARD32 *end = dest + width;
1.180 +
1.181 + while (dest < end) {
1.182 + __m64 x, a;
1.183 + x = MmxTo(*dest);
1.184 + a = MmxTo(*src);
1.185 + a = MmxAlpha(a);
1.186 + MmxMul(x, a);
1.187 + *dest = MmxFrom(x);
1.188 + ++dest;
1.189 + ++src;
1.190 + }
1.191 + _mm_empty();
1.192 +}
1.193 +#endif
1.194 +
1.195 +#if 0
1.196 +static FASTCALL void
1.197 +mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
1.198 +{
1.199 + const __m64 mmx_0 = _mm_setzero_si64();
1.200 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.201 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.202 + /* Disabled (#if 0). Presumably dest = src * (1 - dest.alpha); Mmx* macro semantics assumed, macros not visible here. */
1.203 + const CARD32 *end = dest + width;
1.204 +
1.205 + while (dest < end) {
1.206 + __m64 x, a;
1.207 + x = MmxTo(*src);
1.208 + a = MmxTo(*dest);
1.209 + a = MmxAlpha(a);
1.210 + a = MmxNegate(a);
1.211 + MmxMul(x, a);
1.212 + *dest = MmxFrom(x);
1.213 + ++dest;
1.214 + ++src;
1.215 + }
1.216 + _mm_empty();
1.217 +}
1.218 +#endif
1.219 +
1.220 +#if 0
1.221 +static FASTCALL void
1.222 +mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
1.223 +{
1.224 + const __m64 mmx_0 = _mm_setzero_si64();
1.225 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.226 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.227 + /* Disabled (#if 0). Presumably dest = dest * (1 - src.alpha); Mmx* macro semantics assumed, macros not visible here. */
1.228 + const CARD32 *end = dest + width;
1.229 +
1.230 + while (dest < end) {
1.231 + __m64 x, a;
1.232 + x = MmxTo(*dest);
1.233 + a = MmxTo(*src);
1.234 + a = MmxAlpha(a);
1.235 + a = MmxNegate(a);
1.236 + MmxMul(x, a);
1.237 + *dest = MmxFrom(x);
1.238 + ++dest;
1.239 + ++src;
1.240 + }
1.241 + _mm_empty();
1.242 +}
1.243 +/* Disabled (#if 0). Presumably ATOP: dest = src*dest.alpha + dest*(1 - src.alpha), assuming MmxAddMul(x, a, y, b) computes x = x*a + y*b. */
1.244 +static FASTCALL void
1.245 +mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
1.246 +{
1.247 + const __m64 mmx_0 = _mm_setzero_si64();
1.248 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.249 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.250 +
1.251 + const CARD32 *end = dest + width;
1.252 +
1.253 + while (dest < end) {
1.254 + __m64 s, da, d, sia;
1.255 + s = MmxTo(*src);
1.256 + d = MmxTo(*dest);
1.257 + sia = MmxAlpha(s);
1.258 + sia = MmxNegate(sia);
1.259 + da = MmxAlpha(d);
1.260 + MmxAddMul(s, da, d, sia);
1.261 + *dest = MmxFrom(s);
1.262 + ++dest;
1.263 + ++src;
1.264 + }
1.265 + _mm_empty();
1.266 +}
1.267 +/* Disabled (#if 0). Presumably ATOP-reverse: dest = src*(1 - dest.alpha) + dest*src.alpha, assuming MmxAddMul(x, a, y, b) computes x = x*a + y*b. */
1.268 +static FASTCALL void
1.269 +mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
1.270 +{
1.271 + const __m64 mmx_0 = _mm_setzero_si64();
1.272 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.273 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.274 +
1.275 + const CARD32 *end;
1.276 +
1.277 + end = dest + width;
1.278 +
1.279 + while (dest < end) {
1.280 + __m64 s, dia, d, sa;
1.281 + s = MmxTo(*src);
1.282 + d = MmxTo(*dest);
1.283 + sa = MmxAlpha(s);
1.284 + dia = MmxAlpha(d);
1.285 + dia = MmxNegate(dia);
1.286 + MmxAddMul(s, dia, d, sa);
1.287 + *dest = MmxFrom(s);
1.288 + ++dest;
1.289 + ++src;
1.290 + }
1.291 + _mm_empty();
1.292 +}
1.293 +/* Disabled (#if 0). Presumably XOR: dest = src*(1 - dest.alpha) + dest*(1 - src.alpha), assuming MmxAddMul(x, a, y, b) computes x = x*a + y*b. */
1.294 +static FASTCALL void
1.295 +mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
1.296 +{
1.297 + const __m64 mmx_0 = _mm_setzero_si64();
1.298 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.299 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.300 +
1.301 + const CARD32 *end = dest + width;
1.302 +
1.303 + while (dest < end) {
1.304 + __m64 s, dia, d, sia;
1.305 + s = MmxTo(*src);
1.306 + d = MmxTo(*dest);
1.307 + sia = MmxAlpha(s);
1.308 + dia = MmxAlpha(d);
1.309 + sia = MmxNegate(sia);
1.310 + dia = MmxNegate(dia);
1.311 + MmxAddMul(s, dia, d, sia);
1.312 + *dest = MmxFrom(s);
1.313 + ++dest;
1.314 + ++src;
1.315 + }
1.316 + _mm_empty();
1.317 +}
1.318 +#endif
1.319 +
1.320 +static void
1.321 +mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
1.322 +{ /* For each of `width` pixels: dest = MmxAdd(src, dest); MmxAdd is presumably a saturating add (macro not visible in this chunk). */
1.323 + const __m64 mmx_0 = _mm_setzero_si64();
1.324 + /* mmx_0 is not referenced directly below; presumably MmxTo/MmxFrom use it by name. */
1.325 + const uint32_t *end = dest + width;
1.326 + while (dest < end) {
1.327 + __m64 s, d;
1.328 + s = MmxTo(*src);
1.329 + d = MmxTo(*dest);
1.330 + s = MmxAdd(s, d);
1.331 + *dest = MmxFrom(s);
1.332 + ++dest;
1.333 + ++src;
1.334 + }
1.335 + _mm_empty(); /* leave MMX state before returning */
1.336 +}
1.337 +OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); /* registered for class composite_add_argb */
1.338 +
1.339 +#if 0
1.340 +static FASTCALL void
1.341 +mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
1.342 +{ /* Disabled (#if 0). Per pixel: scale src by da/sa when its top byte exceeds the room left in dest, then add it to dest. */
1.343 + const __m64 mmx_0 = _mm_setzero_si64();
1.344 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.345 + /* Mmx* macros and FbIntDiv are declared outside this chunk; their semantics are assumed from how they are used in this file. */
1.346 + const CARD32 *end = dest + width;
1.347 + while (dest < end) {
1.348 + CARD32 s = *src;
1.349 + CARD32 d = *dest;
1.350 + __m64 ms = MmxTo(s);
1.351 + __m64 md = MmxTo(d);
1.352 + CARD32 sa = s >> 24; /* top byte of src (alpha, going by the names) */
1.353 + CARD32 da = ~d >> 24; /* 255 - top byte of dest */
1.354 + /* sa > da implies sa is non-zero, so the FbIntDiv(da, sa) below never divides by zero. */
1.355 + if (sa > da) {
1.356 + __m64 msa = MmxTo(FbIntDiv(da, sa));
1.357 + msa = MmxAlpha(msa);
1.358 + MmxMul(ms, msa);
1.359 + }
1.360 + md = MmxAdd(md, ms); /* store the sum: MmxAdd is used as a value-returning expression in mmxCombineAddU and mmxCombineAddC */
1.361 + *dest = MmxFrom(md);
1.362 + ++src;
1.363 + ++dest;
1.364 + }
1.365 + _mm_empty();
1.366 +}
1.367 +
1.368 +/* Disabled (#if 0). Component-alpha SRC, presumably dest = src * mask per channel; Mmx* macros not visible here. */
1.369 +static FASTCALL void
1.370 +mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.371 +{
1.372 + const __m64 mmx_0 = _mm_setzero_si64();
1.373 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.374 +
1.375 + const CARD32 *end = src + width;
1.376 + while (src < end) {
1.377 + __m64 a = MmxTo(*mask);
1.378 + __m64 s = MmxTo(*src);
1.379 + MmxMul(s, a);
1.380 + *dest = MmxFrom(s);
1.381 + ++src;
1.382 + ++mask;
1.383 + ++dest;
1.384 + }
1.385 + _mm_empty();
1.386 +}
1.387 +/* Disabled (#if 0). Component-alpha OVER, presumably dest = src*mask + dest*(1 - mask*src.alpha); Mmx* macros not visible here. */
1.388 +static FASTCALL void
1.389 +mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.390 +{
1.391 + const __m64 mmx_0 = _mm_setzero_si64();
1.392 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.393 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.394 +
1.395 + const CARD32 *end = src + width;
1.396 + while (src < end) {
1.397 + __m64 a = MmxTo(*mask);
1.398 + __m64 s = MmxTo(*src);
1.399 + __m64 d = MmxTo(*dest);
1.400 + __m64 sa = MmxAlpha(s);
1.401 + MmxMul(s, a);
1.402 + MmxMul(a, sa);
1.403 + a = MmxNegate(a);
1.404 + MmxMulAdd(d, a, s);
1.405 + *dest = MmxFrom(d);
1.406 + ++src;
1.407 + ++dest;
1.408 + ++mask;
1.409 + }
1.410 + _mm_empty();
1.411 +}
1.412 +/* Disabled (#if 0). Component-alpha OVER-reverse, presumably dest = dest + src*mask*(1 - dest.alpha); Mmx* macros not visible here. */
1.413 +static FASTCALL void
1.414 +mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.415 +{
1.416 + const __m64 mmx_0 = _mm_setzero_si64();
1.417 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.418 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.419 +
1.420 + const CARD32 *end = src + width;
1.421 + while (src < end) {
1.422 + __m64 a = MmxTo(*mask);
1.423 + __m64 s = MmxTo(*src);
1.424 + __m64 d = MmxTo(*dest);
1.425 + __m64 da = MmxAlpha(d);
1.426 + da = MmxNegate(da);
1.427 + MmxMul(s, a);
1.428 + MmxMulAdd(s, da, d);
1.429 + *dest = MmxFrom(s);
1.430 + ++src;
1.431 + ++dest;
1.432 + ++mask;
1.433 + }
1.434 + _mm_empty();
1.435 +}
1.436 +
1.437 +/* Disabled (#if 0). Component-alpha IN, presumably dest = src*mask*dest.alpha; Mmx* macros not visible here. */
1.438 +static FASTCALL void
1.439 +mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.440 +{
1.441 + const __m64 mmx_0 = _mm_setzero_si64();
1.442 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.443 +
1.444 + const CARD32 *end = src + width;
1.445 + while (src < end) {
1.446 + __m64 a = MmxTo(*mask);
1.447 + __m64 s = MmxTo(*src);
1.448 + __m64 d = MmxTo(*dest);
1.449 + __m64 da = MmxAlpha(d);
1.450 + MmxMul(s, a);
1.451 + MmxMul(s, da);
1.452 + *dest = MmxFrom(s);
1.453 + ++src;
1.454 + ++dest;
1.455 + ++mask;
1.456 + }
1.457 + _mm_empty();
1.458 +}
1.459 +/* Disabled (#if 0). Component-alpha IN-reverse, presumably dest = dest*(mask*src.alpha); Mmx* macros not visible here. */
1.460 +static FASTCALL void
1.461 +mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.462 +{
1.463 + const __m64 mmx_0 = _mm_setzero_si64();
1.464 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.465 +
1.466 + const CARD32 *end = src + width;
1.467 + while (src < end) {
1.468 + __m64 a = MmxTo(*mask);
1.469 + __m64 s = MmxTo(*src);
1.470 + __m64 d = MmxTo(*dest);
1.471 + __m64 sa = MmxAlpha(s);
1.472 + MmxMul(a, sa);
1.473 + MmxMul(d, a);
1.474 + *dest = MmxFrom(d);
1.475 + ++src;
1.476 + ++dest;
1.477 + ++mask;
1.478 + }
1.479 + _mm_empty();
1.480 +}
1.481 +/* Disabled (#if 0). Component-alpha OUT, presumably dest = src*mask*(1 - dest.alpha); Mmx* macros not visible here. */
1.482 +static FASTCALL void
1.483 +mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.484 +{
1.485 + const __m64 mmx_0 = _mm_setzero_si64();
1.486 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.487 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.488 +
1.489 + const CARD32 *end = src + width;
1.490 + while (src < end) {
1.491 + __m64 a = MmxTo(*mask);
1.492 + __m64 s = MmxTo(*src);
1.493 + __m64 d = MmxTo(*dest);
1.494 + __m64 da = MmxAlpha(d);
1.495 + da = MmxNegate(da);
1.496 + MmxMul(s, a);
1.497 + MmxMul(s, da);
1.498 + *dest = MmxFrom(s);
1.499 + ++src;
1.500 + ++dest;
1.501 + ++mask;
1.502 + }
1.503 + _mm_empty();
1.504 +}
1.505 +/* Disabled (#if 0). Component-alpha OUT-reverse, presumably dest = dest*(1 - mask*src.alpha); Mmx* macros not visible here. */
1.506 +static FASTCALL void
1.507 +mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.508 +{
1.509 + const __m64 mmx_0 = _mm_setzero_si64();
1.510 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.511 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.512 +
1.513 + const CARD32 *end = src + width;
1.514 + while (src < end) {
1.515 + __m64 a = MmxTo(*mask);
1.516 + __m64 s = MmxTo(*src);
1.517 + __m64 d = MmxTo(*dest);
1.518 + __m64 sa = MmxAlpha(s);
1.519 + MmxMul(a, sa);
1.520 + a = MmxNegate(a);
1.521 + MmxMul(d, a);
1.522 + *dest = MmxFrom(d);
1.523 + ++src;
1.524 + ++dest;
1.525 + ++mask;
1.526 + }
1.527 + _mm_empty();
1.528 +}
1.529 +/* Disabled (#if 0). Component-alpha ATOP, presumably dest = dest*(1 - mask*src.alpha) + src*mask*dest.alpha; Mmx* macros not visible here. */
1.530 +static FASTCALL void
1.531 +mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.532 +{
1.533 + const __m64 mmx_0 = _mm_setzero_si64();
1.534 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.535 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.536 +
1.537 + const CARD32 *end = src + width;
1.538 + while (src < end) {
1.539 + __m64 a = MmxTo(*mask);
1.540 + __m64 s = MmxTo(*src);
1.541 + __m64 d = MmxTo(*dest);
1.542 + __m64 da = MmxAlpha(d);
1.543 + __m64 sa = MmxAlpha(s);
1.544 + MmxMul(s, a);
1.545 + MmxMul(a, sa);
1.546 + a = MmxNegate(a);
1.547 + MmxAddMul(d, a, s, da);
1.548 + *dest = MmxFrom(d);
1.549 + ++src;
1.550 + ++dest;
1.551 + ++mask;
1.552 + }
1.553 + _mm_empty();
1.554 +}
1.555 +/* Disabled (#if 0). Component-alpha ATOP-reverse, presumably dest = dest*(mask*src.alpha) + src*mask*(1 - dest.alpha); Mmx* macros not visible here. */
1.556 +static FASTCALL void
1.557 +mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.558 +{
1.559 + const __m64 mmx_0 = _mm_setzero_si64();
1.560 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.561 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.562 +
1.563 + const CARD32 *end = src + width;
1.564 + while (src < end) {
1.565 + __m64 a = MmxTo(*mask);
1.566 + __m64 s = MmxTo(*src);
1.567 + __m64 d = MmxTo(*dest);
1.568 + __m64 da = MmxAlpha(d);
1.569 + __m64 sa = MmxAlpha(s); /* terminating semicolon was missing here */
1.570 + MmxMul(s, a);
1.571 + MmxMul(a, sa);
1.572 + da = MmxNegate(da);
1.573 + MmxAddMul(d, a, s, da);
1.574 + *dest = MmxFrom(d);
1.575 + ++src;
1.576 + ++dest;
1.577 + ++mask;
1.578 + }
1.579 + _mm_empty();
1.580 +}
1.581 +/* Disabled (#if 0). Component-alpha XOR, presumably dest = dest*(1 - mask*src.alpha) + src*mask*(1 - dest.alpha); Mmx* macros not visible here. */
1.582 +static FASTCALL void
1.583 +mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.584 +{
1.585 + const __m64 mmx_0 = _mm_setzero_si64();
1.586 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.587 + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
1.588 +
1.589 + const CARD32 *end = src + width;
1.590 + while (src < end) {
1.591 + __m64 a = MmxTo(*mask);
1.592 + __m64 s = MmxTo(*src);
1.593 + __m64 d = MmxTo(*dest);
1.594 + __m64 da = MmxAlpha(d);
1.595 + __m64 sa = MmxAlpha(s);
1.596 + MmxMul(s, a);
1.597 + MmxMul(a, sa);
1.598 + da = MmxNegate(da);
1.599 + a = MmxNegate(a);
1.600 + MmxAddMul(d, a, s, da);
1.601 + *dest = MmxFrom(d);
1.602 + ++src;
1.603 + ++dest;
1.604 + ++mask;
1.605 + }
1.606 + _mm_empty();
1.607 +}
1.608 +/* Disabled (#if 0). Component-alpha ADD, presumably dest = src*mask + dest via MmxAdd; Mmx* macros not visible here. */
1.609 +static FASTCALL void
1.610 +mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
1.611 +{
1.612 + const __m64 mmx_0 = _mm_setzero_si64();
1.613 + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
1.614 +
1.615 + const CARD32 *end = src + width;
1.616 + while (src < end) {
1.617 + __m64 a = MmxTo(*mask);
1.618 + __m64 s = MmxTo(*src);
1.619 + __m64 d = MmxTo(*dest);
1.620 + MmxMul(s, a);
1.621 + d = MmxAdd(s, d);
1.622 + *dest = MmxFrom(d);
1.623 + ++src;
1.624 + ++dest;
1.625 + ++mask;
1.626 + }
1.627 + _mm_empty();
1.628 +}
1.629 +/* Disabled (#if 0). Installs the mmxCombine* routines into the global composeFunctions table when fbHaveMMX() reports support. */
1.630 +extern FbComposeFunctions composeFunctions;
1.631 +/* NOTE(review): several routines referenced below sit in their own #if 0 / ENABLE_BROKEN_IMPLS regions, so enabling only this region will not link. */
1.632 +void fbComposeSetupMMX(void)
1.633 +{
1.634 + /* check if we have MMX support and initialize accordingly */
1.635 + if (fbHaveMMX()) {
1.636 + composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
1.637 + composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
1.638 + composeFunctions.combineU[PictOpIn] = mmxCombineInU;
1.639 + composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
1.640 + composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
1.641 + composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
1.642 + composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
1.643 + composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
1.644 + composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
1.645 + composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
1.646 + composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
1.647 + /* component-alpha (three-operand) variants */
1.648 + composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
1.649 + composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
1.650 + composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
1.651 + composeFunctions.combineC[PictOpIn] = mmxCombineInC;
1.652 + composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
1.653 + composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
1.654 + composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
1.655 + composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
1.656 + composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
1.657 + composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
1.658 + composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
1.659 +
1.660 + composeFunctions.combineMaskU = mmxCombineMaskU;
1.661 + }
1.662 +}
1.663 +#endif
1.664 +
1.665 +
1.666 +/* ------------------ MMX code paths called from fbpict.c ----------------------- */
1.667 +/* One 64-bit quantity viewed either as an MMX register value (m64) or as a plain 64-bit integer (ull). */
1.668 +typedef union {
1.669 + __m64 m64;
1.670 + uint64_t ull;
1.671 +} m64_ull;
1.672 +/* Layout of the table of 64-bit constants used by the helpers below; entries are read through the MC() macro. */
1.673 +typedef struct
1.674 +{
1.675 + m64_ull mmx_4x00ff; /* XOR mask used by negate() */
1.676 + m64_ull mmx_4x0080; /* bias added in pix_multiply() */
1.677 + m64_ull mmx_565_rgb; /* AND mask used by expand565() */
1.678 + m64_ull mmx_565_unpack_multiplier; /* multiplier used by expand565() */
1.679 + m64_ull mmx_565_r; /* channel masks used by pack565() */
1.680 + m64_ull mmx_565_g;
1.681 + m64_ull mmx_565_b;
1.682 + m64_ull mmx_mask_0; /* mask_N: AND masks used by pack565() for lane N */
1.683 + m64_ull mmx_mask_1;
1.684 + m64_ull mmx_mask_2;
1.685 + m64_ull mmx_mask_3;
1.686 + m64_ull mmx_full_alpha; /* OR-ed into the alpha in over_rev_non_pre() */
1.687 + m64_ull mmx_ffff0000ffff0000;
1.688 + m64_ull mmx_0000ffff00000000;
1.689 + m64_ull mmx_000000000000ffff;
1.690 +} MMXData;
1.691 +/* The constant table itself, filled through the integer view (.ull) with designated initialisers. */
1.692 +static const MMXData c =
1.693 +{
1.694 + .mmx_4x00ff.ull = 0x00ff00ff00ff00ffULL, /* 0x00ff in each 16-bit lane */
1.695 + .mmx_4x0080.ull = 0x0080008000800080ULL, /* 0x0080 in each 16-bit lane */
1.696 + .mmx_565_rgb.ull = 0x000001f0003f001fULL, /* lanes 2/1/0: 0x01f0, 0x003f, 0x001f */
1.697 + .mmx_565_r.ull = 0x000000f800000000ULL, /* top 5 bits of the low byte of lane 2 */
1.698 + .mmx_565_g.ull = 0x0000000000fc0000ULL, /* top 6 bits of the low byte of lane 1 */
1.699 + .mmx_565_b.ull = 0x00000000000000f8ULL, /* top 5 bits of the low byte of lane 0 */
1.700 + .mmx_mask_0.ull = 0xffffffffffff0000ULL, /* clears lane 0, keeps the rest */
1.701 + .mmx_mask_1.ull = 0xffffffff0000ffffULL, /* clears lane 1 */
1.702 + .mmx_mask_2.ull = 0xffff0000ffffffffULL, /* clears lane 2 */
1.703 + .mmx_mask_3.ull = 0x0000ffffffffffffULL, /* clears lane 3 */
1.704 + .mmx_full_alpha.ull = 0x00ff000000000000ULL, /* 0x00ff in lane 3 only */
1.705 + .mmx_565_unpack_multiplier.ull = 0x0000008404100840ULL, /* lanes 2/1/0: 0x0084, 0x0410, 0x0840 */
1.706 + .mmx_ffff0000ffff0000.ull = 0xffff0000ffff0000ULL,
1.707 + .mmx_0000ffff00000000.ull = 0x0000ffff00000000ULL,
1.708 + .mmx_000000000000ffff.ull = 0x000000000000ffffULL,
1.709 +};
1.710 +/* MC(name) reads entry mmx_<name> of the constant table `c` as an __m64. */
1.711 +#define MC(x) ((__m64) c.mmx_##x.m64)
1.712 +/* Shift the whole 64-bit value: left by s bits when s > 0, logically right by -s bits when s < 0. */
1.713 +static __inline__ __m64
1.714 +shift (__m64 v, int s)
1.715 +{
1.716 + /* A zero count returns the input untouched. */
1.717 + if (s == 0)
1.718 + return v;
1.719 + if (s < 0)
1.720 + return _mm_srli_si64 (v, -s);
1.721 + return _mm_slli_si64 (v, s);
1.722 +}
1.723 +/* XOR every 16-bit lane with 0x00ff; for lane values in 0..255 this yields 255 - value. */
1.724 +static __inline__ __m64
1.725 +negate (__m64 mask)
1.726 +{
1.727 + return _mm_xor_si64 (mask, MC(4x00ff));
1.728 +}
1.729 +/* Lane-wise product of two values held in four 16-bit lanes, scaled back down by roughly 1/255. */
1.730 +static __inline__ __m64
1.731 +pix_multiply (__m64 a, __m64 b)
1.732 +{
1.733 + __m64 res;
1.734 + /* t = a*b + 0x80; result = (t + (t >> 8)) >> 8 -- the usual shift-based stand-in for a rounded division by 255. */
1.735 + res = _mm_mullo_pi16 (a, b);
1.736 + res = _mm_adds_pu16 (res, MC(4x0080)); /* unsigned saturating add per lane */
1.737 + res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
1.738 + res = _mm_srli_pi16 (res, 8);
1.739 +
1.740 + return res;
1.741 +}
1.742 +/* Copy 16-bit lane 3 of `pixel` into all four lanes (lane 3 is the alpha lane, going by the name). */
1.743 +static __inline__ __m64
1.744 +expand_alpha (__m64 pixel)
1.745 +{
1.746 + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
1.747 +}
1.748 +/* Copy 16-bit lane 0 of `pixel` into all four lanes. */
1.749 +static __inline__ __m64
1.750 +expand_alpha_rev (__m64 pixel)
1.751 +{
1.752 + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
1.753 +}
1.754 +/* Swap 16-bit lanes 0 and 2 of `pixel`; lanes 1 and 3 stay in place. */
1.755 +static __inline__ __m64
1.756 +invert_colors (__m64 pixel)
1.757 +{
1.758 + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
1.759 +}
1.760 +
1.761 +/* Notes about writing MMX code
1.762 + *
1.763 + * Give memory operands as the second operand. If you give one as the
1.764 + * first, gcc will first load it into a register, then use that
1.765 + * register.
1.766 + *
1.767 + * I.e. use
1.768 + *
1.769 + * _mm_mullo_pi16 (x, mmx_constant);
1.770 + *
1.771 + * not
1.772 + *
1.773 + * _mm_mullo_pi16 (mmx_constant, x);
1.774 + *
1.775 + * Also try to minimize dependencies, i.e. when you need a value, try
1.776 + * to calculate it from a value that was calculated as early as
1.777 + * possible.
1.778 + */
1.779 +/* src + dest * negate(srca), using pix_multiply() for the product and an unsigned saturating per-byte add for the sum. */
1.780 +static __inline__ __m64
1.781 +over (__m64 src, __m64 srca, __m64 dest)
1.782 +{
1.783 + return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
1.784 +}
1.785 +/* over() for a source whose lanes 0 and 2 are swapped and whose colour is multiplied by its own alpha here (presumably a non-premultiplied, reversed-channel source). */
1.786 +static __inline__ __m64
1.787 +over_rev_non_pre (__m64 src, __m64 dest)
1.788 +{
1.789 + __m64 srca = expand_alpha (src);
1.790 + __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); /* lane 3 forced to 0x00ff, other lanes keep the alpha */
1.791 +
1.792 + return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
1.793 +}
1.794 +/* IN operator: simply pix_multiply(src, mask). */
1.795 +static __inline__ __m64
1.796 +in (__m64 src,
1.797 + __m64 mask)
1.798 +{
1.799 + return pix_multiply (src, mask);
1.800 +}
1.801 +/* (src IN mask) OVER dest: both src and srca are multiplied by mask before being handed to over(). */
1.802 +static __inline__ __m64
1.803 +in_over (__m64 src,
1.804 + __m64 srca,
1.805 + __m64 mask,
1.806 + __m64 dest)
1.807 +{
1.808 + return over(in(src, mask), pix_multiply(srca, mask), dest);
1.809 +}
1.810 +/* Unpack a 32-bit pixel into four 16-bit lanes, one zero-extended byte per lane (lowest byte in lane 0). */
1.811 +static __inline__ __m64
1.812 +load8888 (CARD32 v)
1.813 +{
1.814 + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
1.815 +}
1.816 +/* Pack two expanded pixels (four 16-bit lanes each) into eight bytes. */
1.817 +static __inline__ __m64
1.818 +pack8888 (__m64 lo, __m64 hi)
1.819 +{
1.820 + /* `lo` fills the low four bytes and `hi` the high four; */
1.821 + /* each lane is clamped to 0..255 by the saturating pack. */
1.822 + return _mm_packs_pu16 (lo, hi);
1.823 +}
1.824 +/* Pack the four 16-bit lanes of `v` into bytes via pack8888() and return them as one 32-bit pixel. */
1.825 +static __inline__ CARD32
1.826 +store8888 (__m64 v)
1.827 +{
1.828 + return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
1.829 +}
1.830 +
1.831 +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
1.832 + *
1.833 + * 00RR00GG00BB
1.834 + *
1.835 + * --- Expanding 565 in the low word ---
1.836 + *
1.837 + * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
1.838 + * m = m & (01f0003f001f);
1.839 + * m = m * (008404100840);
1.840 + * m = m >> 8;
1.841 + *
1.842 + * Note the trick here - the top word is shifted by another nibble to
1.843 + * avoid it bumping into the middle word
1.844 + */
1.845 +static __inline__ __m64
1.846 +expand565 (__m64 pixel, int pos)
1.847 +{
1.848 + __m64 p = pixel;
1.849 + __m64 t1, t2;
1.850 +
1.851 + /* move pixel to low 16 bit and zero the rest */
1.852 + p = shift (shift (p, (3 - pos) * 16), -48);
1.853 + /* t1: copy shifted left 25 bits for the top lane; t2: copy shifted left 11 bits for the middle lane */
1.854 + t1 = shift (p, 36 - 11);
1.855 + t2 = shift (p, 16 - 5);
1.856 +
1.857 + p = _mm_or_si64 (t1, p);
1.858 + p = _mm_or_si64 (t2, p);
1.859 + p = _mm_and_si64 (p, MC(565_rgb));
1.860 + /* per-lane multiply by the unpack constants, then drop the low 8 bits of every lane */
1.861 + pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
1.862 + return _mm_srli_pi16 (pixel, 8);
1.863 +}
1.864 +/* Zero-extend one of the two 32-bit pixels held in `in` to four 16-bit lanes. */
1.865 +static __inline__ __m64
1.866 +expand8888 (__m64 in, int pos)
1.867 +{
1.868 + const __m64 zero = _mm_setzero_si64();
1.869 +
1.870 + /* pos == 0 picks the low pixel; any other value picks the high one. */
1.871 + return (pos == 0) ? _mm_unpacklo_pi8 (in, zero) : _mm_unpackhi_pi8 (in, zero);
1.872 +}
1.873 +/* Reduce the expanded pixel `pixel` to 5-6-5 and insert it into 16-bit lane `pos` (0-3) of `target`; the other lanes of target are kept. */
1.874 +static __inline__ __m64
1.875 +pack565 (__m64 pixel, __m64 target, int pos)
1.876 +{
1.877 + __m64 p = pixel;
1.878 + __m64 t = target;
1.879 + __m64 r, g, b;
1.880 + /* keep only the top 5/6/5 bits of the three colour lanes */
1.881 + r = _mm_and_si64 (p, MC(565_r));
1.882 + g = _mm_and_si64 (p, MC(565_g));
1.883 + b = _mm_and_si64 (p, MC(565_b));
1.884 + /* move each field to its 565 position (bits 11-15, 5-10, 0-4), offset by pos*16 to reach the target lane */
1.885 + r = shift (r, - (32 - 8) + pos * 16);
1.886 + g = shift (g, - (16 - 3) + pos * 16);
1.887 + b = shift (b, - (0 + 3) + pos * 16);
1.888 + /* clear the destination lane; a pos outside 0-3 leaves target unmasked */
1.889 + if (pos == 0)
1.890 + t = _mm_and_si64 (t, MC(mask_0));
1.891 + else if (pos == 1)
1.892 + t = _mm_and_si64 (t, MC(mask_1));
1.893 + else if (pos == 2)
1.894 + t = _mm_and_si64 (t, MC(mask_2));
1.895 + else if (pos == 3)
1.896 + t = _mm_and_si64 (t, MC(mask_3));
1.897 + /* merge the three fields into the cleared lane */
1.898 + p = _mm_or_si64 (r, t);
1.899 + p = _mm_or_si64 (g, p);
1.900 +
1.901 + return _mm_or_si64 (b, p);
1.902 +}
1.903 +
1.904 +#ifdef ENABLE_BROKEN_IMPLS
1.905 +/* broken. See Debian bug #340932 */
1.906 +static void
1.907 +fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
1.908 +{
1.909 + __m64 vsrc, vsrca;
1.910 + /* Composite the single pixel *src OVER each of the w pixels at dst; expand the source and its alpha once. */
1.911 + vsrc = load8888 (*src);
1.912 + vsrca = expand_alpha (vsrc);
1.913 + /* Head: one pixel at a time until dst is 8-byte aligned (or w runs out). */
1.914 + while (w && (unsigned long)dst & 7)
1.915 + {
1.916 + *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
1.917 +
1.918 + w--;
1.919 + dst++;
1.920 + }
1.921 + /* Body: two pixels per 64-bit load/store. */
1.922 + while (w >= 2)
1.923 + {
1.924 + __m64 vdest;
1.925 + __m64 dest0, dest1;
1.926 +
1.927 + vdest = *(__m64 *)dst;
1.928 +
1.929 + dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
1.930 + dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
1.931 +
1.932 + *(__m64 *)dst = pack8888(dest0, dest1);
1.933 +
1.934 + dst += 2;
1.935 + w -= 2;
1.936 + }
1.937 + /* Tail: at most one pixel is left here. */
1.938 + while (w)
1.939 + {
1.940 + *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
1.941 +
1.942 + w--;
1.943 + dst++;
1.944 + }
1.945 + /* leave MMX state before returning */
1.946 + _mm_empty();
1.947 +}
1.948 +OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
1.949 + OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
1.950 +#endif
1.951 +
1.952 +#if 0
1.953 +void
1.954 +fbCompositeSolid_nx0565mmx (CARD8 op,
1.955 + PicturePtr pSrc,
1.956 + PicturePtr pMask,
1.957 + PicturePtr pDst,
1.958 + INT16 xSrc,
1.959 + INT16 ySrc,
1.960 + INT16 xMask,
1.961 + INT16 yMask,
1.962 + INT16 xDst,
1.963 + INT16 yDst,
1.964 + CARD16 width,
1.965 + CARD16 height)
1.966 +{
1.967 + CARD32 src;
1.968 + CARD16 *dstLine, *dst;
1.969 + CARD16 w;
1.970 + FbStride dstStride;
1.971 + __m64 vsrc, vsrca;
1.972 + /* NOTE(review): disabled (#if 0). PicturePtr is typedef'd to CARD32* in this file, so pDst->format below cannot compile as written. */
1.973 + CHECKPOINT();
1.974 +
1.975 + fbComposeGetSolid(pSrc, src, pDst->format);
1.976 + /* top byte of the solid colour is zero: nothing to draw */
1.977 + if (src >> 24 == 0)
1.978 + return;
1.979 +
1.980 + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1.981 + /* expand the solid source once and broadcast its lane 3 for over() */
1.982 + vsrc = load8888 (src);
1.983 + vsrca = expand_alpha (vsrc);
1.984 +
1.985 + while (height--)
1.986 + {
1.987 + dst = dstLine;
1.988 + dstLine += dstStride;
1.989 + w = width;
1.990 +
1.991 + CHECKPOINT();
1.992 + /* Head: one 16-bit pixel at a time until dst is 8-byte aligned. */
1.993 + while (w && (unsigned long)dst & 7)
1.994 + {
1.995 + ullong d = *dst;
1.996 + __m64 vdest = expand565 ((__m64)d, 0);
1.997 + vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
1.998 + *dst = (ullong)vdest;
1.999 +
1.1000 + w--;
1.1001 + dst++;
1.1002 + }
1.1003 + /* Body: four 16-bit pixels per 64-bit load/store, one lane at a time. */
1.1004 + while (w >= 4)
1.1005 + {
1.1006 + __m64 vdest;
1.1007 +
1.1008 + vdest = *(__m64 *)dst;
1.1009 +
1.1010 + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
1.1011 + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
1.1012 + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
1.1013 + vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
1.1014 +
1.1015 + *(__m64 *)dst = vdest;
1.1016 +
1.1017 + dst += 4;
1.1018 + w -= 4;
1.1019 + }
1.1020 +
1.1021 + CHECKPOINT();
1.1022 + /* Tail: up to three remaining pixels. */
1.1023 + while (w)
1.1024 + {
1.1025 + ullong d = *dst;
1.1026 + __m64 vdest = expand565 ((__m64)d, 0);
1.1027 + vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
1.1028 + *dst = (ullong)vdest;
1.1029 +
1.1030 + w--;
1.1031 + dst++;
1.1032 + }
1.1033 + }
1.1034 +
1.1035 + _mm_empty();
1.1036 +}
1.1037 +#endif
1.1038 +
1.1039 +#if 0
1.1040 +static void
1.1041 +fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
1.1042 +{
1.1043 + CARD32 src, srca;
1.1044 + CARD32 *dstLine;
1.1045 + CARD32 *maskLine;
1.1046 + FbStride dstStride, maskStride;
1.1047 + __m64 vsrc, vsrca;
1.1048 + /* NOTE(review): disabled (#if 0) and incomplete as shown: twidth, p and q are used but never declared, the local `src` redeclares the parameter, and vsrc/vsrca are never initialised. */
1.1049 + /* Head: single pixels until q is 8-byte aligned; pixels whose mask word is zero are skipped. */
1.1050 + while (twidth && (unsigned long)q & 7)
1.1051 + {
1.1052 + CARD32 m = *(CARD32 *)p;
1.1053 +
1.1054 + if (m)
1.1055 + {
1.1056 + __m64 vdest = load8888(*q);
1.1057 + vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1.1058 + *q = (ullong)pack8888(vdest, _mm_setzero_si64());
1.1059 + }
1.1060 +
1.1061 + twidth--;
1.1062 + p++;
1.1063 + q++;
1.1064 + }
1.1065 + /* Body: two pixels per 64-bit access; skipped when both mask words are zero. */
1.1066 + while (twidth >= 2)
1.1067 + {
1.1068 + CARD32 m0, m1;
1.1069 + m0 = *p;
1.1070 + m1 = *(p + 1);
1.1071 +
1.1072 + if (m0 | m1)
1.1073 + {
1.1074 + __m64 dest0, dest1;
1.1075 + __m64 vdest = *(__m64 *)q;
1.1076 +
1.1077 + dest0 = in_over(vsrc, vsrca, load8888(m0),
1.1078 + expand8888 (vdest, 0));
1.1079 + dest1 = in_over(vsrc, vsrca, load8888(m1),
1.1080 + expand8888 (vdest, 1));
1.1081 +
1.1082 + *(__m64 *)q = pack8888(dest0, dest1);
1.1083 + }
1.1084 +
1.1085 + p += 2;
1.1086 + q += 2;
1.1087 + twidth -= 2;
1.1088 + }
1.1089 + /* Tail: remaining single pixels. */
1.1090 + while (twidth)
1.1091 + {
1.1092 + CARD32 m = *(CARD32 *)p;
1.1093 +
1.1094 + if (m)
1.1095 + {
1.1096 + __m64 vdest = load8888(*q);
1.1097 + vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1.1098 + *q = (ullong)pack8888(vdest, _mm_setzero_si64());
1.1099 + }
1.1100 +
1.1101 + twidth--;
1.1102 + p++;
1.1103 + q++;
1.1104 + }
1.1105 +
1.1106 + _mm_empty();
1.1107 +}
1.1108 +#endif
1.1109 +
1.1110 +#if 0
1.1111 +static void /* Disabled (enclosed in #if 0): blends 32-bit source pixels into 32-bit destination pixels with one constant mask value, via in_over(). */
1.1112 +fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
1.1113 + int width) /* NOTE(review): the body also uses maskLine, vmask, srca, height, dst, dstLine, dstStride, srcLine, srcStride and w, none of which is declared here */
1.1114 +{
1.1115 +
1.1116 + mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; /* copies *maskLine into each byte position of a 32-bit value (assuming it is an 8-bit quantity); NOTE(review): assigns an integer to the pointer parameter `mask` */
1.1117 + vmask = load8888 (mask);
1.1118 + srca = MC(4x00ff); /* constant passed as the source-alpha argument of in_over(); MC() is defined outside this view */
1.1119 +
1.1120 + while (height--) /* one iteration per scanline */
1.1121 + {
1.1122 + dst = dstLine;
1.1123 + dstLine += dstStride;
1.1124 + src = srcLine;
1.1125 + srcLine += srcStride;
1.1126 + w = width;
1.1127 +
1.1128 + while (w && (unsigned long)dst & 7) /* head: single pixels until dst is 8-byte aligned */
1.1129 + {
1.1130 + __m64 s = load8888 (*src);
1.1131 + __m64 d = load8888 (*dst);
1.1132 +
1.1133 + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
1.1134 +
1.1135 + w--;
1.1136 + dst++;
1.1137 + src++;
1.1138 + }
1.1139 +
1.1140 + while (w >= 16) /* body: 16 pixels per iteration as eight 64-bit destination/source pairs */
1.1141 + {
1.1142 + __m64 vd0 = *(__m64 *)(dst + 0); /* each __m64 holds two adjacent 32-bit pixels */
1.1143 + __m64 vd1 = *(__m64 *)(dst + 2);
1.1144 + __m64 vd2 = *(__m64 *)(dst + 4);
1.1145 + __m64 vd3 = *(__m64 *)(dst + 6);
1.1146 + __m64 vd4 = *(__m64 *)(dst + 8);
1.1147 + __m64 vd5 = *(__m64 *)(dst + 10);
1.1148 + __m64 vd6 = *(__m64 *)(dst + 12);
1.1149 + __m64 vd7 = *(__m64 *)(dst + 14);
1.1150 +
1.1151 + __m64 vs0 = *(__m64 *)(src + 0); /* src alignment is not checked, only dst was aligned above */
1.1152 + __m64 vs1 = *(__m64 *)(src + 2);
1.1153 + __m64 vs2 = *(__m64 *)(src + 4);
1.1154 + __m64 vs3 = *(__m64 *)(src + 6);
1.1155 + __m64 vs4 = *(__m64 *)(src + 8);
1.1156 + __m64 vs5 = *(__m64 *)(src + 10);
1.1157 + __m64 vs6 = *(__m64 *)(src + 12);
1.1158 + __m64 vs7 = *(__m64 *)(src + 14);
1.1159 +
1.1160 + vd0 = (__m64)pack8888 ( /* blend pixel 0 and pixel 1 of the pair separately, then repack into one 64-bit value */
1.1161 + in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1.1162 + in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1.1163 +
1.1164 + vd1 = (__m64)pack8888 (
1.1165 + in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1.1166 + in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1.1167 +
1.1168 + vd2 = (__m64)pack8888 (
1.1169 + in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1.1170 + in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1.1171 +
1.1172 + vd3 = (__m64)pack8888 (
1.1173 + in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1.1174 + in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1.1175 +
1.1176 + vd4 = (__m64)pack8888 (
1.1177 + in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1.1178 + in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1.1179 +
1.1180 + vd5 = (__m64)pack8888 (
1.1181 + in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1.1182 + in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1.1183 +
1.1184 + vd6 = (__m64)pack8888 (
1.1185 + in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1.1186 + in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1.1187 +
1.1188 + vd7 = (__m64)pack8888 (
1.1189 + in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1.1190 + in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1.1191 +
1.1192 + *(__m64 *)(dst + 0) = vd0; /* write all eight results back after every blend has been computed */
1.1193 + *(__m64 *)(dst + 2) = vd1;
1.1194 + *(__m64 *)(dst + 4) = vd2;
1.1195 + *(__m64 *)(dst + 6) = vd3;
1.1196 + *(__m64 *)(dst + 8) = vd4;
1.1197 + *(__m64 *)(dst + 10) = vd5;
1.1198 + *(__m64 *)(dst + 12) = vd6;
1.1199 + *(__m64 *)(dst + 14) = vd7;
1.1200 +
1.1201 + w -= 16;
1.1202 + dst += 16;
1.1203 + src += 16;
1.1204 + }
1.1205 +
1.1206 + while (w) /* tail: up to 15 remaining pixels, one at a time */
1.1207 + {
1.1208 + __m64 s = load8888 (*src);
1.1209 + __m64 d = load8888 (*dst);
1.1210 +
1.1211 + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
1.1212 +
1.1213 + w--;
1.1214 + dst++;
1.1215 + src++;
1.1216 + }
1.1217 + }
1.1218 +
1.1219 + _mm_empty(); /* EMMS: release the MMX register state */
1.1220 +}
1.1221 +
1.1222 +void /* Disabled (enclosed in #if 0): for every pixel of a width x height area, stores over(s, expand_alpha(s), d) into the 32-bit destination. */
1.1223 +fbCompositeSrc_8888x8888mmx (CARD8 op, /* not read by this function */
1.1224 + PicturePtr pSrc, /* picture the source scanlines are fetched from */
1.1225 + PicturePtr pMask, /* not read by this function */
1.1226 + PicturePtr pDst, /* picture the destination scanlines are fetched from */
1.1227 + INT16 xSrc, /* source origin passed to fbComposeGetStart */
1.1228 + INT16 ySrc,
1.1229 + INT16 xMask, /* not read by this function */
1.1230 + INT16 yMask, /* not read by this function */
1.1231 + INT16 xDst, /* destination origin passed to fbComposeGetStart */
1.1232 + INT16 yDst,
1.1233 + CARD16 width, /* pixels processed per scanline */
1.1234 + CARD16 height) /* number of scanlines processed */
1.1235 +{
1.1236 + CARD32 *dstLine, *dst;
1.1237 + CARD32 *srcLine, *src;
1.1238 + FbStride dstStride, srcStride;
1.1239 + CARD16 w;
1.1240 + __m64 srca; /* assigned below but not used by any blend in this function */
1.1241 +
1.1242 + CHECKPOINT();
1.1243 +
1.1244 + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); /* sets dstLine/dstStride (macro defined outside this view) */
1.1245 + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); /* sets srcLine/srcStride */
1.1246 +
1.1247 + srca = MC (4x00ff);
1.1248 +
1.1249 + while (height--) /* one iteration per scanline */
1.1250 + {
1.1251 + dst = dstLine;
1.1252 + dstLine += dstStride;
1.1253 + src = srcLine;
1.1254 + srcLine += srcStride;
1.1255 + w = width;
1.1256 +
1.1257 + while (w && (unsigned long)dst & 7) /* head: single pixels until dst is 8-byte aligned */
1.1258 + {
1.1259 + __m64 s = load8888 (*src);
1.1260 + __m64 d = load8888 (*dst);
1.1261 +
1.1262 + *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
1.1263 +
1.1264 + w--;
1.1265 + dst++;
1.1266 + src++;
1.1267 + }
1.1268 +
1.1269 + while (w >= 2) /* body: two pixels per 64-bit load/store */
1.1270 + {
1.1271 + __m64 vd = *(__m64 *)(dst + 0);
1.1272 + __m64 vs = *(__m64 *)(src + 0);
1.1273 + __m64 vs0 = expand8888 (vs, 0); /* first pixel of the source pair */
1.1274 + __m64 vs1 = expand8888 (vs, 1); /* second pixel of the source pair */
1.1275 +
1.1276 + *(__m64 *)dst = (__m64)pack8888 (
1.1277 + over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
1.1278 + over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
1.1279 +
1.1280 + w -= 2;
1.1281 + dst += 2;
1.1282 + src += 2;
1.1283 + }
1.1284 +
1.1285 + while (w) /* tail: at most one leftover pixel */
1.1286 + {
1.1287 + __m64 s = load8888 (*src);
1.1288 + __m64 d = load8888 (*dst);
1.1289 +
1.1290 + *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
1.1291 + (__m64)_mm_setzero_si64());
1.1292 +
1.1293 + w--;
1.1294 + dst++;
1.1295 + src++;
1.1296 + }
1.1297 + }
1.1298 +
1.1299 + _mm_empty(); /* EMMS: release the MMX register state */
1.1300 +}
1.1301 +
1.1302 +void /* Disabled (enclosed in #if 0): blends one solid colour into 32-bit destination pixels through an 8-bit-per-pixel mask, via in_over(). */
1.1303 +fbCompositeSolidMask_nx8x8888mmx (CARD8 op, /* not read by this function */
1.1304 + PicturePtr pSrc, /* picture the solid colour is taken from (fbComposeGetSolid) */
1.1305 + PicturePtr pMask, /* picture holding the 8-bit mask scanlines */
1.1306 + PicturePtr pDst, /* destination picture; its format is passed to fbComposeGetSolid */
1.1307 + INT16 xSrc, /* not read by this function */
1.1308 + INT16 ySrc, /* not read by this function */
1.1309 + INT16 xMask, /* mask origin */
1.1310 + INT16 yMask,
1.1311 + INT16 xDst, /* destination origin */
1.1312 + INT16 yDst,
1.1313 + CARD16 width, /* pixels processed per scanline */
1.1314 + CARD16 height) /* number of scanlines processed */
1.1315 +{
1.1316 + CARD32 src, srca;
1.1317 + CARD32 *dstLine, *dst;
1.1318 + CARD8 *maskLine, *mask;
1.1319 + FbStride dstStride, maskStride;
1.1320 + CARD16 w;
1.1321 + __m64 vsrc, vsrca;
1.1322 + ullong srcsrc; /* the 32-bit colour repeated in both halves of a 64-bit word */
1.1323 +
1.1324 + CHECKPOINT();
1.1325 +
1.1326 + fbComposeGetSolid(pSrc, src, pDst->format); /* fills `src` (macro defined outside this view) */
1.1327 +
1.1328 + srca = src >> 24; /* top byte of the colour */
1.1329 + if (srca == 0) /* nothing is drawn when the top byte is zero */
1.1330 + return;
1.1331 +
1.1332 + srcsrc = (unsigned long long)src << 32 | src;
1.1333 +
1.1334 + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1.1335 + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
1.1336 +
1.1337 + vsrc = load8888 (src);
1.1338 + vsrca = expand_alpha (vsrc);
1.1339 +
1.1340 + while (height--) /* one iteration per scanline */
1.1341 + {
1.1342 + dst = dstLine;
1.1343 + dstLine += dstStride;
1.1344 + mask = maskLine;
1.1345 + maskLine += maskStride;
1.1346 + w = width;
1.1347 +
1.1348 + CHECKPOINT();
1.1349 +
1.1350 + while (w && (unsigned long)dst & 7) /* head: single pixels until dst is 8-byte aligned */
1.1351 + {
1.1352 + ullong m = *mask; /* mask byte widened to 64 bits so it can be cast to __m64 */
1.1353 +
1.1354 + if (m) /* a zero mask leaves the destination pixel untouched */
1.1355 + {
1.1356 + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
1.1357 + *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1.1358 + }
1.1359 +
1.1360 + w--;
1.1361 + mask++;
1.1362 + dst++;
1.1363 + }
1.1364 +
1.1365 + CHECKPOINT();
1.1366 +
1.1367 + while (w >= 2) /* body: two pixels per iteration */
1.1368 + {
1.1369 + ullong m0, m1;
1.1370 + m0 = *mask;
1.1371 + m1 = *(mask + 1);
1.1372 +
1.1373 + if (srca == 0xff && (m0 & m1) == 0xff) /* fast path: colour top byte and both mask bytes are 0xff, so the colour is stored directly */
1.1374 + {
1.1375 + *(unsigned long long *)dst = srcsrc;
1.1376 + }
1.1377 + else if (m0 | m1) /* blend only if at least one of the two masks is non-zero */
1.1378 + {
1.1379 + __m64 vdest;
1.1380 + __m64 dest0, dest1;
1.1381 +
1.1382 + vdest = *(__m64 *)dst;
1.1383 +
1.1384 + dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
1.1385 + dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
1.1386 +
1.1387 + *(__m64 *)dst = pack8888(dest0, dest1);
1.1388 + }
1.1389 +
1.1390 + mask += 2;
1.1391 + dst += 2;
1.1392 + w -= 2;
1.1393 + }
1.1394 +
1.1395 + CHECKPOINT();
1.1396 +
1.1397 + while (w) /* tail: at most one leftover pixel */
1.1398 + {
1.1399 + ullong m = *mask;
1.1400 +
1.1401 + if (m)
1.1402 + {
1.1403 + __m64 vdest = load8888(*dst);
1.1404 + vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
1.1405 + *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1.1406 + }
1.1407 +
1.1408 + w--;
1.1409 + mask++;
1.1410 + dst++;
1.1411 + }
1.1412 + }
1.1413 +
1.1414 + _mm_empty(); /* EMMS: release the MMX register state */
1.1415 +}
1.1416 +
1.1417 +
1.1418 +void /* Disabled (enclosed in #if 0): blends one solid colour into 16-bit destination pixels through an 8-bit-per-pixel mask, via in_over() and the 565 pack/expand helpers. */
1.1419 +fbCompositeSolidMask_nx8x0565mmx (CARD8 op, /* not read by this function */
1.1420 + PicturePtr pSrc, /* picture the solid colour is taken from (fbComposeGetSolid) */
1.1421 + PicturePtr pMask, /* picture holding the 8-bit mask scanlines */
1.1422 + PicturePtr pDst, /* destination picture; its format is passed to fbComposeGetSolid */
1.1423 + INT16 xSrc, /* not read by this function */
1.1424 + INT16 ySrc, /* not read by this function */
1.1425 + INT16 xMask, /* mask origin */
1.1426 + INT16 yMask,
1.1427 + INT16 xDst, /* destination origin */
1.1428 + INT16 yDst,
1.1429 + CARD16 width, /* pixels processed per scanline */
1.1430 + CARD16 height) /* number of scanlines processed */
1.1431 +{
1.1432 + CARD32 src, srca;
1.1433 + CARD16 *dstLine, *dst;
1.1434 + CARD8 *maskLine, *mask;
1.1435 + FbStride dstStride, maskStride;
1.1436 + CARD16 w;
1.1437 + __m64 vsrc, vsrca;
1.1438 + unsigned long long srcsrcsrcsrc, src16; /* src16: result of pack565 on the colour; srcsrcsrcsrc: src16 repeated in all four 16-bit positions */
1.1439 +
1.1440 + CHECKPOINT();
1.1441 +
1.1442 + fbComposeGetSolid(pSrc, src, pDst->format); /* fills `src` (macro defined outside this view) */
1.1443 +
1.1444 + srca = src >> 24; /* top byte of the colour */
1.1445 + if (srca == 0) /* nothing is drawn when the top byte is zero */
1.1446 + return;
1.1447 +
1.1448 + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1.1449 + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
1.1450 +
1.1451 + vsrc = load8888 (src);
1.1452 + vsrca = expand_alpha (vsrc);
1.1453 +
1.1454 + src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
1.1455 +
1.1456 + srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
1.1457 + (ullong)src16 << 16 | (ullong)src16;
1.1458 +
1.1459 + while (height--) /* one iteration per scanline */
1.1460 + {
1.1461 + dst = dstLine;
1.1462 + dstLine += dstStride;
1.1463 + mask = maskLine;
1.1464 + maskLine += maskStride;
1.1465 + w = width;
1.1466 +
1.1467 + CHECKPOINT();
1.1468 +
1.1469 + while (w && (unsigned long)dst & 7) /* head: single 16-bit pixels until dst is 8-byte aligned */
1.1470 + {
1.1471 + ullong m = *mask;
1.1472 +
1.1473 + if (m) /* a zero mask leaves the destination pixel untouched */
1.1474 + {
1.1475 + ullong d = *dst; /* widen the 16-bit pixel so it can be cast to __m64 */
1.1476 + __m64 vd = (__m64)d;
1.1477 + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1.1478 + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1.1479 + }
1.1480 +
1.1481 + w--;
1.1482 + mask++;
1.1483 + dst++;
1.1484 + }
1.1485 +
1.1486 + CHECKPOINT();
1.1487 +
1.1488 + while (w >= 4) /* body: four 16-bit pixels per 64-bit load/store */
1.1489 + {
1.1490 + ullong m0, m1, m2, m3;
1.1491 + m0 = *mask;
1.1492 + m1 = *(mask + 1);
1.1493 + m2 = *(mask + 2);
1.1494 + m3 = *(mask + 3);
1.1495 +
1.1496 + if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) /* fast path: colour top byte and all four mask bytes are 0xff */
1.1497 + {
1.1498 + *(unsigned long long *)dst = srcsrcsrcsrc;
1.1499 + }
1.1500 + else if (m0 | m1 | m2 | m3) /* blend only if at least one mask is non-zero */
1.1501 + {
1.1502 + __m64 vdest;
1.1503 + __m64 vm0, vm1, vm2, vm3;
1.1504 +
1.1505 + vdest = *(__m64 *)dst;
1.1506 +
1.1507 + vm0 = (__m64)m0; /* each step blends position i of vdest and passes vdest plus index i back to pack565 */
1.1508 + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
1.1509 + vm1 = (__m64)m1;
1.1510 + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
1.1511 + vm2 = (__m64)m2;
1.1512 + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
1.1513 + vm3 = (__m64)m3;
1.1514 + vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
1.1515 +
1.1516 + *(__m64 *)dst = vdest;
1.1517 + }
1.1518 +
1.1519 + w -= 4;
1.1520 + mask += 4;
1.1521 + dst += 4;
1.1522 + }
1.1523 +
1.1524 + CHECKPOINT();
1.1525 +
1.1526 + while (w) /* tail: up to three leftover pixels */
1.1527 + {
1.1528 + ullong m = *mask;
1.1529 +
1.1530 + if (m)
1.1531 + {
1.1532 + ullong d = *dst;
1.1533 + __m64 vd = (__m64)d;
1.1534 + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1.1535 + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1.1536 + }
1.1537 +
1.1538 + w--;
1.1539 + mask++;
1.1540 + dst++;
1.1541 + }
1.1542 + }
1.1543 +
1.1544 + _mm_empty(); /* EMMS: release the MMX register state */
1.1545 +}
1.1546 +
1.1547 +void /* Disabled (enclosed in #if 0): combines 32-bit source pixels with 16-bit destination pixels using over_rev_non_pre() and the 565 pack/expand helpers. */
1.1548 +fbCompositeSrc_8888RevNPx0565mmx (CARD8 op, /* not read by this function */
1.1549 + PicturePtr pSrc, /* picture the 32-bit source scanlines are fetched from */
1.1550 + PicturePtr pMask, /* only used in the assert below */
1.1551 + PicturePtr pDst, /* picture the 16-bit destination scanlines are fetched from */
1.1552 + INT16 xSrc, /* source origin */
1.1553 + INT16 ySrc,
1.1554 + INT16 xMask, /* not read by this function */
1.1555 + INT16 yMask, /* not read by this function */
1.1556 + INT16 xDst, /* destination origin */
1.1557 + INT16 yDst,
1.1558 + CARD16 width, /* pixels processed per scanline */
1.1559 + CARD16 height) /* number of scanlines processed */
1.1560 +{
1.1561 + CARD16 *dstLine, *dst;
1.1562 + CARD32 *srcLine, *src;
1.1563 + FbStride dstStride, srcStride;
1.1564 + CARD16 w;
1.1565 +
1.1566 + CHECKPOINT();
1.1567 +
1.1568 + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1.1569 + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1.1570 +
1.1571 + assert (pSrc->pDrawable == pMask->pDrawable); /* source and mask must refer to the same drawable */
1.1572 +
1.1573 + while (height--) /* one iteration per scanline */
1.1574 + {
1.1575 + dst = dstLine;
1.1576 + dstLine += dstStride;
1.1577 + src = srcLine;
1.1578 + srcLine += srcStride;
1.1579 + w = width;
1.1580 +
1.1581 + CHECKPOINT();
1.1582 +
1.1583 + while (w && (unsigned long)dst & 7) /* head: single 16-bit pixels until dst is 8-byte aligned */
1.1584 + {
1.1585 + __m64 vsrc = load8888 (*src);
1.1586 + ullong d = *dst; /* widen the 16-bit pixel so it can be cast to __m64 */
1.1587 + __m64 vdest = expand565 ((__m64)d, 0);
1.1588 +
1.1589 + vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1.1590 +
1.1591 + *dst = (ullong)vdest;
1.1592 +
1.1593 + w--;
1.1594 + dst++;
1.1595 + src++;
1.1596 + }
1.1597 +
1.1598 + CHECKPOINT();
1.1599 +
1.1600 + while (w >= 4) /* body: four pixels per 64-bit destination store */
1.1601 + {
1.1602 + CARD32 s0, s1, s2, s3;
1.1603 + unsigned char a0, a1, a2, a3; /* top byte of each source pixel */
1.1604 +
1.1605 + s0 = *src;
1.1606 + s1 = *(src + 1);
1.1607 + s2 = *(src + 2);
1.1608 + s3 = *(src + 3);
1.1609 +
1.1610 + a0 = (s0 >> 24);
1.1611 + a1 = (s1 >> 24);
1.1612 + a2 = (s2 >> 24);
1.1613 + a3 = (s3 >> 24);
1.1614 +
1.1615 + if ((a0 & a1 & a2 & a3) == 0xFF) /* fast path: all four top bytes are 0xFF, so the destination is not read */
1.1616 + {
1.1617 + __m64 vdest;
1.1618 + vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
1.1619 + vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
1.1620 + vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
1.1621 + vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
1.1622 +
1.1623 + *(__m64 *)dst = vdest;
1.1624 + }
1.1625 + else if (a0 | a1 | a2 | a3) /* blend only if at least one top byte is non-zero */
1.1626 + {
1.1627 + __m64 vdest = *(__m64 *)dst;
1.1628 +
1.1629 + vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
1.1630 + vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
1.1631 + vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
1.1632 + vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
1.1633 +
1.1634 + *(__m64 *)dst = vdest;
1.1635 + }
1.1636 +
1.1637 + w -= 4;
1.1638 + dst += 4;
1.1639 + src += 4;
1.1640 + }
1.1641 +
1.1642 + CHECKPOINT();
1.1643 +
1.1644 + while (w) /* tail: up to three leftover pixels */
1.1645 + {
1.1646 + __m64 vsrc = load8888 (*src);
1.1647 + ullong d = *dst;
1.1648 + __m64 vdest = expand565 ((__m64)d, 0);
1.1649 +
1.1650 + vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1.1651 +
1.1652 + *dst = (ullong)vdest;
1.1653 +
1.1654 + w--;
1.1655 + dst++;
1.1656 + src++;
1.1657 + }
1.1658 + }
1.1659 +
1.1660 + _mm_empty(); /* EMMS: release the MMX register state */
1.1661 +}
1.1662 +
1.1663 +/* "8888RevNP" is GdkPixbuf's format: ABGR, non-premultiplied */
1.1664 +
1.1665 +void /* Disabled (enclosed in #if 0): combines 32-bit source pixels with 32-bit destination pixels using over_rev_non_pre(). */
1.1666 +fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, /* not read by this function */
1.1667 + PicturePtr pSrc, /* picture the source scanlines are fetched from */
1.1668 + PicturePtr pMask, /* only used in the assert below */
1.1669 + PicturePtr pDst, /* picture the destination scanlines are fetched from */
1.1670 + INT16 xSrc, /* source origin */
1.1671 + INT16 ySrc,
1.1672 + INT16 xMask, /* not read by this function */
1.1673 + INT16 yMask, /* not read by this function */
1.1674 + INT16 xDst, /* destination origin */
1.1675 + INT16 yDst,
1.1676 + CARD16 width, /* pixels processed per scanline */
1.1677 + CARD16 height) /* number of scanlines processed */
1.1678 +{
1.1679 + CARD32 *dstLine, *dst;
1.1680 + CARD32 *srcLine, *src;
1.1681 + FbStride dstStride, srcStride;
1.1682 + CARD16 w;
1.1683 +
1.1684 + CHECKPOINT();
1.1685 +
1.1686 + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1.1687 + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1.1688 +
1.1689 + assert (pSrc->pDrawable == pMask->pDrawable); /* source and mask must refer to the same drawable */
1.1690 +
1.1691 + while (height--) /* one iteration per scanline */
1.1692 + {
1.1693 + dst = dstLine;
1.1694 + dstLine += dstStride;
1.1695 + src = srcLine;
1.1696 + srcLine += srcStride;
1.1697 + w = width;
1.1698 +
1.1699 + while (w && (unsigned long)dst & 7) /* head: single pixels until dst is 8-byte aligned */
1.1700 + {
1.1701 + __m64 s = load8888 (*src);
1.1702 + __m64 d = load8888 (*dst);
1.1703 +
1.1704 + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1.1705 +
1.1706 + w--;
1.1707 + dst++;
1.1708 + src++;
1.1709 + }
1.1710 +
1.1711 + while (w >= 2) /* body: two pixels per 64-bit destination store */
1.1712 + {
1.1713 + ullong s0, s1;
1.1714 + unsigned char a0, a1; /* top byte of each source pixel */
1.1715 + __m64 d0, d1;
1.1716 +
1.1717 + s0 = *src;
1.1718 + s1 = *(src + 1);
1.1719 +
1.1720 + a0 = (s0 >> 24);
1.1721 + a1 = (s1 >> 24);
1.1722 +
1.1723 + if ((a0 & a1) == 0xFF) /* fast path: both top bytes are 0xFF, so the destination is not read */
1.1724 + {
1.1725 + d0 = invert_colors(load8888(s0));
1.1726 + d1 = invert_colors(load8888(s1));
1.1727 +
1.1728 + *(__m64 *)dst = pack8888 (d0, d1);
1.1729 + }
1.1730 + else if (a0 | a1) /* blend only if at least one top byte is non-zero */
1.1731 + {
1.1732 + __m64 vdest = *(__m64 *)dst;
1.1733 +
1.1734 + d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
1.1735 + d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
1.1736 +
1.1737 + *(__m64 *)dst = pack8888 (d0, d1);
1.1738 + }
1.1739 +
1.1740 + w -= 2;
1.1741 + dst += 2;
1.1742 + src += 2;
1.1743 + }
1.1744 +
1.1745 + while (w) /* tail: at most one leftover pixel */
1.1746 + {
1.1747 + __m64 s = load8888 (*src);
1.1748 + __m64 d = load8888 (*dst);
1.1749 +
1.1750 + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1.1751 +
1.1752 + w--;
1.1753 + dst++;
1.1754 + src++;
1.1755 + }
1.1756 + }
1.1757 +
1.1758 + _mm_empty(); /* EMMS: release the MMX register state */
1.1759 +}
1.1760 +
1.1761 +void /* Disabled (enclosed in #if 0): blends one solid colour into 16-bit destination pixels through a 32-bit-per-pixel mask, via in_over() and the 565 pack/expand helpers. */
1.1762 +fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, /* not read by this function */
1.1763 + PicturePtr pSrc, /* picture the solid colour is taken from (fbComposeGetSolid) */
1.1764 + PicturePtr pMask, /* picture holding the 32-bit mask scanlines */
1.1765 + PicturePtr pDst, /* destination picture; its format is passed to fbComposeGetSolid */
1.1766 + INT16 xSrc, /* not read by this function */
1.1767 + INT16 ySrc, /* not read by this function */
1.1768 + INT16 xMask, /* mask origin */
1.1769 + INT16 yMask,
1.1770 + INT16 xDst, /* destination origin */
1.1771 + INT16 yDst,
1.1772 + CARD16 width, /* pixels processed per scanline */
1.1773 + CARD16 height) /* number of scanlines processed */
1.1774 +{
1.1775 + CARD32 src, srca;
1.1776 + CARD16 *dstLine;
1.1777 + CARD32 *maskLine;
1.1778 + FbStride dstStride, maskStride;
1.1779 + __m64 vsrc, vsrca;
1.1780 +
1.1781 + CHECKPOINT();
1.1782 +
1.1783 + fbComposeGetSolid(pSrc, src, pDst->format); /* fills `src` (macro defined outside this view) */
1.1784 +
1.1785 + srca = src >> 24; /* top byte of the colour */
1.1786 + if (srca == 0) /* nothing is drawn when the top byte is zero */
1.1787 + return;
1.1788 +
1.1789 + fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1.1790 + fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
1.1791 +
1.1792 + vsrc = load8888 (src);
1.1793 + vsrca = expand_alpha (vsrc);
1.1794 +
1.1795 + while (height--) /* one iteration per scanline; the line pointers advance at the bottom of the loop */
1.1796 + {
1.1797 + int twidth = width; /* pixels left on this scanline */
1.1798 + CARD32 *p = (CARD32 *)maskLine; /* mask cursor */
1.1799 + CARD16 *q = (CARD16 *)dstLine; /* destination cursor */
1.1800 +
1.1801 + while (twidth && ((unsigned long)q & 7)) /* head: single 16-bit pixels until q is 8-byte aligned */
1.1802 + {
1.1803 + CARD32 m = *(CARD32 *)p;
1.1804 +
1.1805 + if (m) /* a zero mask leaves the destination pixel untouched */
1.1806 + {
1.1807 + ullong d = *q; /* widen the 16-bit pixel so it can be cast to __m64 */
1.1808 + __m64 vdest = expand565 ((__m64)d, 0);
1.1809 + vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
1.1810 + *q = (ullong)vdest;
1.1811 + }
1.1812 +
1.1813 + twidth--;
1.1814 + p++;
1.1815 + q++;
1.1816 + }
1.1817 +
1.1818 + while (twidth >= 4) /* body: four pixels per 64-bit destination load/store */
1.1819 + {
1.1820 + CARD32 m0, m1, m2, m3;
1.1821 +
1.1822 + m0 = *p;
1.1823 + m1 = *(p + 1);
1.1824 + m2 = *(p + 2);
1.1825 + m3 = *(p + 3);
1.1826 +
1.1827 + if ((m0 | m1 | m2 | m3)) /* skip the group when all four masks are zero */
1.1828 + {
1.1829 + __m64 vdest = *(__m64 *)q;
1.1830 +
1.1831 + vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
1.1832 + vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
1.1833 + vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
1.1834 + vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
1.1835 +
1.1836 + *(__m64 *)q = vdest;
1.1837 + }
1.1838 + twidth -= 4;
1.1839 + p += 4;
1.1840 + q += 4;
1.1841 + }
1.1842 +
1.1843 + while (twidth) /* tail: up to three leftover pixels */
1.1844 + {
1.1845 + CARD32 m;
1.1846 +
1.1847 + m = *(CARD32 *)p;
1.1848 + if (m)
1.1849 + {
1.1850 + ullong d = *q;
1.1851 + __m64 vdest = expand565((__m64)d, 0);
1.1852 + vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
1.1853 + *q = (ullong)vdest;
1.1854 + }
1.1855 +
1.1856 + twidth--;
1.1857 + p++;
1.1858 + q++;
1.1859 + }
1.1860 +
1.1861 + maskLine += maskStride; /* step both line pointers to the next scanline */
1.1862 + dstLine += dstStride;
1.1863 + }
1.1864 +
1.1865 + _mm_empty (); /* EMMS: release the MMX register state */
1.1866 +}
1.1867 +#endif
1.1868 +
1.1869 +static void /* Adds w bytes of src into dst with unsigned saturation (each result is clamped to 0xff), in place. */
1.1870 +fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w) /* dst: bytes updated in place; src: bytes added; w: byte count */
1.1871 +{
1.1872 + int s;
1.1873 + int d;
1.1874 + int t;
1.1875 +
1.1876 + while (w && (unsigned long)dst & 7) /* head: scalar bytes until dst is 8-byte aligned */
1.1877 + {
1.1878 + s = *src;
1.1879 + d = *dst;
1.1880 + t = d + s; /* 0..510, so bit 8 is set exactly when the sum overflows a byte */
1.1881 + s = t | (0 - (t >> 8)); /* on overflow (t >> 8) is 1, so the OR with -1 sets every bit and the stored byte becomes 0xff */
1.1882 + *dst = s;
1.1883 +
1.1884 + dst++;
1.1885 + src++;
1.1886 + w--;
1.1887 + }
1.1888 +
1.1889 + while (w >= 8) /* body: eight bytes per iteration with the MMX saturating add */
1.1890 + {
1.1891 + *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); /* only dst was aligned above; src is read at whatever alignment it has */
1.1892 + dst += 8;
1.1893 + src += 8;
1.1894 + w -= 8;
1.1895 + }
1.1896 +
1.1897 + while (w) /* tail: up to seven leftover bytes, same scalar arithmetic as the head */
1.1898 + {
1.1899 + s = *src;
1.1900 + d = *dst;
1.1901 + t = d + s;
1.1902 + s = t | (0 - (t >> 8));
1.1903 + *dst = s;
1.1904 +
1.1905 + dst++;
1.1906 + src++;
1.1907 + w--;
1.1908 + }
1.1909 +
1.1910 + _mm_empty(); /* EMMS: release the MMX register state so x87 floating point can be used afterwards */
1.1911 +}
1.1912 +OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX); /* liboil registration: implementation of class composite_add_u8, flagged as requiring MMX */
1.1913 +
1.1914 +static void /* Adds w 32-bit pixels of src into dst, byte by byte with unsigned saturation, in place. */
1.1915 +fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w) /* dst: pixels updated in place; src: pixels added; w: pixel count */
1.1916 +{
1.1917 + while (w && (unsigned long)dst & 7) /* head: single pixels until dst is 8-byte aligned */
1.1918 + {
1.1919 + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), /* move each pixel into an MMX register, saturating-add the bytes, take the low 32 bits back */
1.1920 + _mm_cvtsi32_si64(*dst)));
1.1921 + dst++;
1.1922 + src++;
1.1923 + w--;
1.1924 + }
1.1925 +
1.1926 + while (w >= 2) /* body: two pixels (8 bytes) per iteration */
1.1927 + {
1.1928 + *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); /* only dst was aligned above; src is read at whatever alignment it has */
1.1929 + dst += 2;
1.1930 + src += 2;
1.1931 + w -= 2;
1.1932 + }
1.1933 +
1.1934 + if (w) /* tail: at most one pixel can remain, so no loop and no pointer update is needed */
1.1935 + {
1.1936 + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1.1937 + _mm_cvtsi32_si64(*dst)));
1.1938 +
1.1939 + }
1.1940 +
1.1941 + _mm_empty(); /* EMMS: release the MMX register state so x87 floating point can be used afterwards */
1.1942 +}
1.1943 +OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); /* liboil registration: implementation of class composite_add_argb, flagged as requiring both MMX and SSE */
1.1944 +
1.1945 +#if 0
1.1946 +#define GetStart(drw,x,y,type,stride,line,bpp) { /* Disabled (#if 0). Sets `stride` (in units of `type`) and `line` (address of element (x, y)) for drawable `drw`; `bpp` receives the depth reported by fbGetDrawable. */ \
1.1947 + FbBits *__bits__; \
1.1948 + FbStride __stride__; \
1.1949 + int __xoff__,__yoff__; \
1.1950 + \
1.1951 + fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \
1.1952 + (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \
1.1953 + (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \
1.1954 +}
1.1955 +
1.1956 +Bool /* Disabled (#if 0). Fills a width x height rectangle of a 16- or 32-bpp drawable with `xor`; returns FALSE without drawing for unsupported cases, TRUE otherwise. */
1.1957 +fbSolidFillmmx (DrawablePtr pDraw, /* drawable to fill */
1.1958 + int x, /* left edge, in pixels */
1.1959 + int y, /* top edge, in scanlines */
1.1960 + int width, /* rectangle width in pixels */
1.1961 + int height, /* rectangle height in scanlines */
1.1962 + FbBits xor) /* value stored into the pixels */
1.1963 +{
1.1964 + FbStride stride;
1.1965 + int bpp;
1.1966 + ullong fill; /* `xor` repeated in both 32-bit halves */
1.1967 + __m64 vfill;
1.1968 + CARD32 byte_width; /* bytes to write per scanline */
1.1969 + CARD8 *byte_line; /* first byte of the current scanline's span */
1.1970 + FbBits *bits;
1.1971 + int xoff, yoff;
1.1972 +
1.1973 + CHECKPOINT();
1.1974 +
1.1975 + fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); /* fills bits/stride/bpp/xoff/yoff (macro defined outside this view) */
1.1976 +
1.1977 + if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) /* at 16 bpp the two 16-bit halves of xor must match, because 32-bit and 64-bit stores write it to adjacent pixels */
1.1978 + return FALSE;
1.1979 +
1.1980 + if (bpp != 16 && bpp != 32) /* only 16 and 32 bpp are handled */
1.1981 + return FALSE;
1.1982 +
1.1983 + if (bpp == 16)
1.1984 + {
1.1985 + stride = stride * sizeof (FbBits) / 2; /* stride in 16-bit pixels, for the address computation */
1.1986 + byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
1.1987 + byte_width = 2 * width;
1.1988 + stride *= 2; /* back to bytes, since byte_line is a byte pointer */
1.1989 + }
1.1990 + else
1.1991 + {
1.1992 + stride = stride * sizeof (FbBits) / 4; /* stride in 32-bit pixels, for the address computation */
1.1993 + byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
1.1994 + byte_width = 4 * width;
1.1995 + stride *= 4; /* back to bytes */
1.1996 + }
1.1997 +
1.1998 + fill = ((ullong)xor << 32) | xor;
1.1999 + vfill = (__m64)fill;
1.2000 +
1.2001 + while (height--) /* one iteration per scanline */
1.2002 + {
1.2003 + int w; /* bytes left on this scanline */
1.2004 + CARD8 *d = byte_line;
1.2005 + byte_line += stride;
1.2006 + w = byte_width;
1.2007 +
1.2008 + while (w >= 2 && ((unsigned long)d & 3)) /* 16-bit stores until d is 4-byte aligned */
1.2009 + {
1.2010 + *(CARD16 *)d = xor;
1.2011 + w -= 2;
1.2012 + d += 2;
1.2013 + }
1.2014 +
1.2015 + while (w >= 4 && ((unsigned long)d & 7)) /* 32-bit stores until d is 8-byte aligned */
1.2016 + {
1.2017 + *(CARD32 *)d = xor;
1.2018 +
1.2019 + w -= 4;
1.2020 + d += 4;
1.2021 + }
1.2022 +
1.2023 + while (w >= 64) /* bulk: 64 bytes per iteration as eight 64-bit MMX stores */
1.2024 + {
1.2025 + *(__m64*) (d + 0) = vfill;
1.2026 + *(__m64*) (d + 8) = vfill;
1.2027 + *(__m64*) (d + 16) = vfill;
1.2028 + *(__m64*) (d + 24) = vfill;
1.2029 + *(__m64*) (d + 32) = vfill;
1.2030 + *(__m64*) (d + 40) = vfill;
1.2031 + *(__m64*) (d + 48) = vfill;
1.2032 + *(__m64*) (d + 56) = vfill;
1.2033 +
1.2034 + w -= 64;
1.2035 + d += 64;
1.2036 + }
1.2037 + while (w >= 4) /* remainder below 64 bytes goes out as 32-bit stores */
1.2038 + {
1.2039 + *(CARD32 *)d = xor;
1.2040 +
1.2041 + w -= 4;
1.2042 + d += 4;
1.2043 + }
1.2044 + if (w >= 2) /* final 16-bit pixel, if any */
1.2045 + {
1.2046 + *(CARD16 *)d = xor;
1.2047 + w -= 2;
1.2048 + d += 2;
1.2049 + }
1.2050 + }
1.2051 +
1.2052 + _mm_empty(); /* EMMS: release the MMX register state */
1.2053 + return TRUE;
1.2054 +}
1.2055 +
1.2056 +Bool /* Disabled (#if 0). Copies a width x height rectangle between two drawables of equal depth (16 or 32 bpp); returns FALSE without copying for unsupported depths, TRUE otherwise. */
1.2057 +fbCopyAreammx (DrawablePtr pSrc, /* drawable read from */
1.2058 + DrawablePtr pDst, /* drawable written to */
1.2059 + int src_x, /* source rectangle origin */
1.2060 + int src_y,
1.2061 + int dst_x, /* destination rectangle origin */
1.2062 + int dst_y,
1.2063 + int width, /* rectangle width in pixels */
1.2064 + int height) /* rectangle height in scanlines */
1.2065 +{
1.2066 + FbBits * src_bits;
1.2067 + FbStride src_stride;
1.2068 + int src_bpp;
1.2069 + int src_xoff;
1.2070 + int src_yoff;
1.2071 +
1.2072 + FbBits * dst_bits;
1.2073 + FbStride dst_stride;
1.2074 + int dst_bpp;
1.2075 + int dst_xoff;
1.2076 + int dst_yoff;
1.2077 +
1.2078 + CARD8 * src_bytes; /* first byte of the current source scanline span */
1.2079 + CARD8 * dst_bytes; /* first byte of the current destination scanline span */
1.2080 + int byte_width; /* bytes copied per scanline */
1.2081 +
1.2082 + fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); /* macro defined outside this view */
1.2083 + fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
1.2084 +
1.2085 + if (src_bpp != 16 && src_bpp != 32) /* only 16 and 32 bpp are handled */
1.2086 + return FALSE;
1.2087 +
1.2088 + if (dst_bpp != 16 && dst_bpp != 32)
1.2089 + return FALSE;
1.2090 +
1.2091 + if (src_bpp != dst_bpp) /* no depth conversion: both sides must match */
1.2092 + {
1.2093 + return FALSE;
1.2094 + }
1.2095 +
1.2096 + if (src_bpp == 16)
1.2097 + {
1.2098 + src_stride = src_stride * sizeof (FbBits) / 2; /* strides in 16-bit pixels, for the address computation */
1.2099 + dst_stride = dst_stride * sizeof (FbBits) / 2;
1.2100 + src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1.2101 + dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1.2102 + byte_width = 2 * width;
1.2103 + src_stride *= 2; /* back to bytes, since the cursors are byte pointers */
1.2104 + dst_stride *= 2;
1.2105 + }
1.2106 + else
1.2107 + {
1.2108 + src_stride = src_stride * sizeof (FbBits) / 4; /* strides in 32-bit pixels, for the address computation */
1.2109 + dst_stride = dst_stride * sizeof (FbBits) / 4;
1.2110 + src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1.2111 + dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1.2112 + byte_width = 4 * width;
1.2113 + src_stride *= 4; /* back to bytes */
1.2114 + dst_stride *= 4;
1.2115 + }
1.2116 +
1.2117 + while (height--) /* one iteration per scanline, always top to bottom and left to right */
1.2118 + {
1.2119 + int w; /* bytes left on this scanline */
1.2120 + CARD8 *s = src_bytes;
1.2121 + CARD8 *d = dst_bytes;
1.2122 + src_bytes += src_stride;
1.2123 + dst_bytes += dst_stride;
1.2124 + w = byte_width;
1.2125 +
1.2126 + while (w >= 2 && ((unsigned long)d & 3)) /* 16-bit copies until d is 4-byte aligned; s alignment is never checked */
1.2127 + {
1.2128 + *(CARD16 *)d = *(CARD16 *)s;
1.2129 + w -= 2;
1.2130 + s += 2;
1.2131 + d += 2;
1.2132 + }
1.2133 +
1.2134 + while (w >= 4 && ((unsigned long)d & 7)) /* 32-bit copies until d is 8-byte aligned */
1.2135 + {
1.2136 + *(CARD32 *)d = *(CARD32 *)s;
1.2137 +
1.2138 + w -= 4;
1.2139 + s += 4;
1.2140 + d += 4;
1.2141 + }
1.2142 +
1.2143 + while (w >= 64) /* bulk: 64 bytes per iteration as eight 64-bit MMX moves */
1.2144 + {
1.2145 + *(__m64 *)(d + 0) = *(__m64 *)(s + 0);
1.2146 + *(__m64 *)(d + 8) = *(__m64 *)(s + 8);
1.2147 + *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
1.2148 + *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
1.2149 + *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
1.2150 + *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
1.2151 + *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
1.2152 + *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
1.2153 + w -= 64;
1.2154 + s += 64;
1.2155 + d += 64;
1.2156 + }
1.2157 + while (w >= 4) /* remainder below 64 bytes goes out as 32-bit copies */
1.2158 + {
1.2159 + *(CARD32 *)d = *(CARD32 *)s;
1.2160 +
1.2161 + w -= 4;
1.2162 + s += 4;
1.2163 + d += 4;
1.2164 + }
1.2165 + if (w >= 2) /* final 16-bit pixel, if any */
1.2166 + {
1.2167 + *(CARD16 *)d = *(CARD16 *)s;
1.2168 + w -= 2;
1.2169 + s += 2;
1.2170 + d += 2;
1.2171 + }
1.2172 + }
1.2173 +
1.2174 + _mm_empty(); /* EMMS: release the MMX register state */
1.2175 + return TRUE;
1.2176 +}
1.2177 +
1.2178 +void /* Disabled (#if 0). Composite-style entry point that forwards to fbCopyAreammx() on the pictures' drawables. */
1.2179 +fbCompositeCopyAreammx (CARD8 op, /* not read by this function */
1.2180 + PicturePtr pSrc, /* its pDrawable is the copy source */
1.2181 + PicturePtr pMask, /* not read by this function */
1.2182 + PicturePtr pDst, /* its pDrawable is the copy destination */
1.2183 + INT16 xSrc, /* source origin */
1.2184 + INT16 ySrc,
1.2185 + INT16 xMask, /* not read by this function */
1.2186 + INT16 yMask, /* not read by this function */
1.2187 + INT16 xDst, /* destination origin */
1.2188 + INT16 yDst,
1.2189 + CARD16 width, /* rectangle width in pixels */
1.2190 + CARD16 height) /* rectangle height in scanlines */
1.2191 +{
1.2192 + fbCopyAreammx (pSrc->pDrawable, /* the Bool result is discarded, so a FALSE (nothing copied) return is not reported */
1.2193 + pDst->pDrawable,
1.2194 + xSrc, ySrc,
1.2195 + xDst, yDst,
1.2196 + width, height);
1.2197 +}
1.2198 +
1.2199 +#if !defined(__amd64__) && !defined(__x86_64__)
1.2200 +
1.2201 +enum CPUFeatures { /* Disabled (#if 0). Bit flags combined into the value returned by detectCPUFeatures(). */
1.2202 + NoFeatures = 0,
1.2203 + MMX = 0x1,
1.2204 + MMX_Extensions = 0x2,
1.2205 + SSE = 0x6, /* 0x4 | 0x2: setting SSE also sets the MMX_Extensions bit */
1.2206 + SSE2 = 0x8,
1.2207 + CMOV = 0x10
1.2208 +};
1.2209 +
1.2210 +static unsigned int detectCPUFeatures(void) { /* Disabled (#if 0). Returns an OR of enum CPUFeatures flags derived from CPUID on 32-bit x86; returns 0 when CPUID leaf 1 reports nothing or CPUID is unavailable. */
1.2211 + unsigned int result; /* EDX from CPUID leaf 1, or 0 if CPUID is not available */
1.2212 + char vendor[13]; /* 12-character vendor string plus terminator */
1.2213 + vendor[0] = 0;
1.2214 + vendor[12] = 0;
1.2215 + /* see p. 118 of amd64 instruction set manual Vol3 */
1.2216 + __asm__ ("push %%ebx\n" /* EBX is saved by hand rather than listed as a clobber */
1.2217 + "pushf\n"
1.2218 + "pop %%eax\n" /* EAX = current EFLAGS */
1.2219 + "mov %%eax, %%ebx\n"
1.2220 + "xor $0x00200000, %%eax\n" /* flip EFLAGS bit 21 (the ID flag) */
1.2221 + "push %%eax\n"
1.2222 + "popf\n"
1.2223 + "pushf\n"
1.2224 + "pop %%eax\n" /* read EFLAGS back */
1.2225 + "mov $0x0, %%edx\n" /* default result when CPUID is unavailable */
1.2226 + "xor %%ebx, %%eax\n" /* zero if the flipped bit did not stick */
1.2227 + "jz skip\n" /* NOTE(review): `skip` is a plain, non-local assembler label */
1.2228 +
1.2229 + "mov $0x00000000, %%eax\n" /* CPUID leaf 0: vendor string in EBX, EDX, ECX */
1.2230 + "cpuid\n"
1.2231 + "mov %%ebx, %1\n"
1.2232 + "mov %%edx, %2\n"
1.2233 + "mov %%ecx, %3\n"
1.2234 + "mov $0x00000001, %%eax\n" /* CPUID leaf 1: feature bits in EDX */
1.2235 + "cpuid\n"
1.2236 + "skip:\n"
1.2237 + "pop %%ebx\n"
1.2238 + "mov %%edx, %0\n"
1.2239 + : "=r" (result),
1.2240 + "=m" (vendor[0]), /* each "=m" operand names one char but receives a 4-byte store */
1.2241 + "=m" (vendor[4]),
1.2242 + "=m" (vendor[8])
1.2243 + :
1.2244 + : "%eax", "%ecx", "%edx"
1.2245 + );
1.2246 +
1.2247 + unsigned int features = 0;
1.2248 + if (result) {
1.2249 + /* result now contains the standard feature bits */
1.2250 + if (result & (1 << 15))
1.2251 + features |= CMOV;
1.2252 + if (result & (1 << 23))
1.2253 + features |= MMX;
1.2254 + if (result & (1 << 25))
1.2255 + features |= SSE;
1.2256 + if (result & (1 << 26))
1.2257 + features |= SSE2;
1.2258 + if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) { /* NOTE(review): tests the raw CPUID word against the enum values 0x1 and 0x6, not the `features` mask built above — confirm intent */
1.2259 + /* check for AMD MMX extensions */
1.2260 +
1.2261 + unsigned int result; /* shadows the outer `result`; receives EDX from the extended query */
1.2262 + __asm__("push %%ebx\n"
1.2263 + "mov $0x80000000, %%eax\n" /* extended leaf 0x80000000 */
1.2264 + "cpuid\n"
1.2265 + "xor %%edx, %%edx\n" /* default: no extended feature bits */
1.2266 + "cmp $0x1, %%eax\n"
1.2267 + "jge skip2\n" /* signed compare: leaf 0x80000001 is skipped when EAX >= 1 */
1.2268 + "mov $0x80000001, %%eax\n"
1.2269 + "cpuid\n"
1.2270 + "skip2:\n"
1.2271 + "mov %%edx, %0\n"
1.2272 + "pop %%ebx\n"
1.2273 + : "=r" (result)
1.2274 + :
1.2275 + : "%eax", "%ecx", "%edx"
1.2276 + );
1.2277 + if (result & (1<<22)) /* bit 22 of the extended EDX word */
1.2278 + features |= MMX_Extensions;
1.2279 + }
1.2280 + }
1.2281 + return features;
1.2282 +}
1.2283 +
1.2284 +Bool /* Disabled (#if 0). Reports whether detectCPUFeatures() found both the MMX and MMX_Extensions bits; the answer is computed once and cached. */
1.2285 +fbHaveMMX (void)
1.2286 +{
1.2287 + static Bool initialized = FALSE; /* set after the first call so detection runs only once */
1.2288 + static Bool mmx_present; /* cached result */
1.2289 +
1.2290 + if (!initialized) /* the statics are updated without any locking */
1.2291 + {
1.2292 + unsigned int features = detectCPUFeatures();
1.2293 + mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions); /* both bits are required, plain MMX alone is not enough */
1.2294 + initialized = TRUE;
1.2295 + }
1.2296 +
1.2297 + return mmx_present;
1.2298 +}
1.2299 +#endif /* __amd64__ */
1.2300 +
1.2301 +
1.2302 +#endif
1.2303 +
1.2304 +
1.2305 +#ifdef __SYMBIAN32__
1.2306 +
1.2307 +OilFunctionImpl* __oil_function_impl_mmxCombineOverU() { /* Symbian accessor: returns the address of the mmxCombineOverU implementation record. The symbol is named after the implementation only; the class name (composite_over_argb) is not part of it. */
1.2308 + return &_oil_function_impl_mmxCombineOverU; /* assumes mmxCombineOverU is registered with OIL_DEFINE_IMPL_FULL earlier in this file — TODO confirm */
1.2309 +}
1.2310 +#endif
1.2311 +
1.2312 +#ifdef __SYMBIAN32__
1.2313 +
1.2314 +OilFunctionImpl* __oil_function_impl_mmxCombineAddU() { /* Symbian accessor: returns the address of the mmxCombineAddU implementation record. The symbol is named after the implementation only; the class name (composite_add_argb) is not part of it. */
1.2315 + return &_oil_function_impl_mmxCombineAddU; /* assumes mmxCombineAddU is registered with OIL_DEFINE_IMPL_FULL earlier in this file — TODO confirm */
1.2316 +}
1.2317 +#endif
1.2318 +
1.2319 +#ifdef __SYMBIAN32__
1.2320 +
1.2321 +OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx() { /* Symbian accessor: returns the address of the fbCompositeSolid_nx8888mmx implementation record. The symbol is named after the implementation only; the class name (composite_over_argb_const_src) is not part of it. */
1.2322 + return &_oil_function_impl_fbCompositeSolid_nx8888mmx; /* assumes fbCompositeSolid_nx8888mmx is registered with OIL_DEFINE_IMPL_FULL earlier in this file — TODO confirm */
1.2323 +}
1.2324 +#endif
1.2325 +
1.2326 +#ifdef __SYMBIAN32__
1.2327 +
1.2328 +OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx() { /* Symbian accessor: returns the address of the fbCompositeSrcAdd_8000x8000mmx implementation record. The symbol is named after the implementation only; the class name (composite_add_u8) is not part of it. */
1.2329 + return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx; /* record created by the OIL_DEFINE_IMPL_FULL registration of fbCompositeSrcAdd_8000x8000mmx in this file */
1.2330 +}
1.2331 +#endif
1.2332 +
1.2333 +#ifdef __SYMBIAN32__
1.2334 +
1.2335 +OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx() { /* Symbian accessor: returns the address of the fbCompositeSrcAdd_8888x8888mmx implementation record. The symbol is named after the implementation only; the class name (composite_add_argb) is not part of it. */
1.2336 + return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx; /* record created by the OIL_DEFINE_IMPL_FULL registration of fbCompositeSrcAdd_8888x8888mmx in this file */
1.2337 +}
1.2338 +#endif
1.2339 +