os/ossrv/genericopenlibs/liboil/src/fb/fbmmx.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 * Copyright © 2004 Red Hat, Inc.
sl@0
     3
 * Copyright © 2004 Nicholas Miell
sl@0
     4
 * Copyright © 2005 Trolltech AS
sl@0
     5
 *
sl@0
     6
 * Permission to use, copy, modify, distribute, and sell this software and its
sl@0
     7
 * documentation for any purpose is hereby granted without fee, provided that
sl@0
     8
 * the above copyright notice appear in all copies and that both that
sl@0
     9
 * copyright notice and this permission notice appear in supporting
sl@0
    10
 * documentation, and that the name of Red Hat not be used in advertising or
sl@0
    11
 * publicity pertaining to distribution of the software without specific,
sl@0
    12
 * written prior permission.  Red Hat makes no representations about the
sl@0
    13
 * suitability of this software for any purpose.  It is provided "as is"
sl@0
    14
 * without express or implied warranty.
sl@0
    15
 *
sl@0
    16
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
sl@0
    17
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
sl@0
    18
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
sl@0
    19
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
sl@0
    20
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
sl@0
    21
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
sl@0
    22
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
sl@0
    23
 * SOFTWARE.
sl@0
    24
 *
sl@0
    25
 * Author:  Søren Sandmann (sandmann@redhat.com)
sl@0
    26
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
sl@0
    27
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) 
sl@0
    28
 *
sl@0
    29
 * Based on work by Owen Taylor
sl@0
    30
 */
sl@0
    31
//Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
sl@0
    32
sl@0
    33
#ifdef HAVE_CONFIG_H
sl@0
    34
#include "config.h"
sl@0
    35
#endif
sl@0
    36
sl@0
    37
#include <liboil/liboil.h>
sl@0
    38
#include <liboil/liboilfunction.h>
sl@0
    39
 
sl@0
    40
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
sl@0
    41
sl@0
    42
/* X-server style aliases for the fixed-width integer types used below. */
typedef uint32_t CARD32;
typedef uint16_t CARD16;
typedef int16_t INT16;
typedef uint8_t CARD8;
typedef uint64_t ullong;
/* NOTE(review): in this port PicturePtr and FbBits are both plain pointers
 * to 32-bit words; confirm that matches what fbmmx.h / fbpict.h expect. */
typedef CARD32* PicturePtr;
typedef CARD32* FbBits;
typedef int FbStride;
sl@0
    50
sl@0
    51
sl@0
    52
#include "fbmmx.h"
sl@0
    53
#include "fbpict.h"
sl@0
    54
sl@0
    55
#define CHECKPOINT()
sl@0
    56
sl@0
    57
/* liboil function classes referenced by the OIL_DEFINE_IMPL_FULL
 * registrations in this file. */
OIL_DECLARE_CLASS (composite_in_argb);
OIL_DECLARE_CLASS (composite_in_argb_const_src);
OIL_DECLARE_CLASS (composite_in_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_argb);
OIL_DECLARE_CLASS (composite_over_argb_const_src);
OIL_DECLARE_CLASS (composite_add_argb);
OIL_DECLARE_CLASS (composite_add_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb);
OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_u8);
OIL_DECLARE_CLASS (composite_add_u8);
sl@0
    69
sl@0
    70
sl@0
    71
/* --------------- MMX code paths for fbcompose.c --------------------- */
sl@0
    72
sl@0
    73
#if 0
sl@0
    74
static void
sl@0
    75
mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
sl@0
    76
{
sl@0
    77
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
    78
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
    79
    
sl@0
    80
    const uint32_t *end = mask + width;
sl@0
    81
    while (mask < end) {
sl@0
    82
        __m64 a = MmxTo(*mask);
sl@0
    83
        __m64 s = MmxTo(*src);
sl@0
    84
        a = MmxAlpha(a);
sl@0
    85
        MmxMul(s, a);
sl@0
    86
        *dest = MmxFrom(s);
sl@0
    87
        ++src;
sl@0
    88
        ++dest;
sl@0
    89
        ++mask;
sl@0
    90
    }
sl@0
    91
    _mm_empty();
sl@0
    92
}
sl@0
    93
#endif
sl@0
    94
sl@0
    95
#ifdef ENABLE_BROKEN_IMPLS
sl@0
    96
static void
sl@0
    97
mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
sl@0
    98
{
sl@0
    99
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   100
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   101
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   102
sl@0
   103
    const uint32_t *end = dest + width;
sl@0
   104
sl@0
   105
    while (dest < end) {
sl@0
   106
        __m64 x, y, a;
sl@0
   107
        x = MmxTo(*src);
sl@0
   108
        y = MmxTo(*dest);
sl@0
   109
        a = MmxAlpha(x);
sl@0
   110
        a = MmxNegate(a);
sl@0
   111
        MmxMulAdd(y, a, x);
sl@0
   112
        *dest = MmxFrom(y);
sl@0
   113
        ++dest;
sl@0
   114
        ++src;
sl@0
   115
    }
sl@0
   116
    _mm_empty();
sl@0
   117
}
sl@0
   118
OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
sl@0
   119
#endif
sl@0
   120
sl@0
   121
#if 0
sl@0
   122
static FASTCALL void
sl@0
   123
mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   124
{
sl@0
   125
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   126
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   127
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   128
sl@0
   129
    const CARD32 *end = dest + width;
sl@0
   130
sl@0
   131
    while (dest < end) {
sl@0
   132
        __m64 x, y, a;
sl@0
   133
        x = MmxTo(*dest);
sl@0
   134
        y = MmxTo(*src);
sl@0
   135
        a = MmxAlpha(x);
sl@0
   136
        a = MmxNegate(a);
sl@0
   137
        MmxMulAdd(y, a, x);
sl@0
   138
        *dest = MmxFrom(y);
sl@0
   139
        ++dest;
sl@0
   140
        ++src;
sl@0
   141
    }
sl@0
   142
    _mm_empty();
sl@0
   143
}
sl@0
   144
#endif
sl@0
   145
sl@0
   146
#if 0
sl@0
   147
static void
sl@0
   148
mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   149
{
sl@0
   150
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   151
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   152
sl@0
   153
    const CARD32 *end = dest + width;
sl@0
   154
sl@0
   155
    while (dest < end) {
sl@0
   156
        __m64 x, a;
sl@0
   157
        x = MmxTo(*src);
sl@0
   158
        a = MmxTo(*dest);
sl@0
   159
        a = MmxAlpha(a);
sl@0
   160
        MmxMul(x, a);
sl@0
   161
        *dest = MmxFrom(x);
sl@0
   162
        ++dest;
sl@0
   163
        ++src;
sl@0
   164
    }
sl@0
   165
    _mm_empty();
sl@0
   166
}
sl@0
   167
#endif
sl@0
   168
sl@0
   169
#if 0
sl@0
   170
static FASTCALL void
sl@0
   171
mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   172
{
sl@0
   173
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   174
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   175
sl@0
   176
    const CARD32 *end = dest + width;
sl@0
   177
sl@0
   178
    while (dest < end) {
sl@0
   179
        __m64 x, a;
sl@0
   180
        x = MmxTo(*dest);
sl@0
   181
        a = MmxTo(*src);
sl@0
   182
        a = MmxAlpha(a);
sl@0
   183
        MmxMul(x, a);
sl@0
   184
        *dest = MmxFrom(x);
sl@0
   185
        ++dest;
sl@0
   186
        ++src;
sl@0
   187
    }
sl@0
   188
    _mm_empty();
sl@0
   189
}
sl@0
   190
#endif
sl@0
   191
sl@0
   192
#if 0
sl@0
   193
static FASTCALL void
sl@0
   194
mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   195
{
sl@0
   196
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   197
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   198
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   199
sl@0
   200
    const CARD32 *end = dest + width;
sl@0
   201
sl@0
   202
    while (dest < end) {
sl@0
   203
        __m64 x, a;
sl@0
   204
        x = MmxTo(*src);
sl@0
   205
        a = MmxTo(*dest);
sl@0
   206
        a = MmxAlpha(a);
sl@0
   207
        a = MmxNegate(a);
sl@0
   208
        MmxMul(x, a);
sl@0
   209
        *dest = MmxFrom(x);
sl@0
   210
        ++dest;
sl@0
   211
        ++src;
sl@0
   212
    }
sl@0
   213
    _mm_empty();
sl@0
   214
}
sl@0
   215
#endif
sl@0
   216
sl@0
   217
#if 0
sl@0
   218
static FASTCALL void
sl@0
   219
mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   220
{
sl@0
   221
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   222
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   223
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   224
sl@0
   225
    const CARD32 *end = dest + width;
sl@0
   226
sl@0
   227
    while (dest < end) {
sl@0
   228
        __m64 x, a;
sl@0
   229
        x = MmxTo(*dest);
sl@0
   230
        a = MmxTo(*src);
sl@0
   231
        a = MmxAlpha(a);
sl@0
   232
        a = MmxNegate(a);
sl@0
   233
        MmxMul(x, a);
sl@0
   234
        *dest = MmxFrom(x);
sl@0
   235
        ++dest;
sl@0
   236
        ++src;
sl@0
   237
    }
sl@0
   238
    _mm_empty();
sl@0
   239
}
sl@0
   240
sl@0
   241
static FASTCALL void
sl@0
   242
mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   243
{
sl@0
   244
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   245
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   246
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   247
sl@0
   248
    const CARD32 *end = dest + width;
sl@0
   249
sl@0
   250
    while (dest < end) {
sl@0
   251
        __m64 s, da, d, sia;
sl@0
   252
        s = MmxTo(*src);
sl@0
   253
        d = MmxTo(*dest);
sl@0
   254
        sia = MmxAlpha(s);
sl@0
   255
        sia = MmxNegate(sia);
sl@0
   256
        da = MmxAlpha(d);
sl@0
   257
        MmxAddMul(s, da, d, sia);
sl@0
   258
        *dest = MmxFrom(s);
sl@0
   259
        ++dest;
sl@0
   260
        ++src;
sl@0
   261
    }
sl@0
   262
    _mm_empty();
sl@0
   263
}
sl@0
   264
sl@0
   265
static FASTCALL void
sl@0
   266
mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   267
{
sl@0
   268
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   269
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   270
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   271
sl@0
   272
    const CARD32 *end;
sl@0
   273
sl@0
   274
    end = dest + width;
sl@0
   275
sl@0
   276
    while (dest < end) {
sl@0
   277
        __m64 s, dia, d, sa;
sl@0
   278
        s = MmxTo(*src);
sl@0
   279
        d = MmxTo(*dest);
sl@0
   280
        sa = MmxAlpha(s);
sl@0
   281
        dia = MmxAlpha(d);
sl@0
   282
        dia = MmxNegate(dia);
sl@0
   283
        MmxAddMul(s, dia, d, sa);
sl@0
   284
        *dest = MmxFrom(s);
sl@0
   285
        ++dest;
sl@0
   286
        ++src;
sl@0
   287
    }
sl@0
   288
    _mm_empty();
sl@0
   289
}
sl@0
   290
sl@0
   291
static FASTCALL void
sl@0
   292
mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   293
{
sl@0
   294
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   295
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   296
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   297
sl@0
   298
    const CARD32 *end = dest + width;
sl@0
   299
sl@0
   300
    while (dest < end) {
sl@0
   301
        __m64 s, dia, d, sia;
sl@0
   302
        s = MmxTo(*src);
sl@0
   303
        d = MmxTo(*dest);
sl@0
   304
        sia = MmxAlpha(s);
sl@0
   305
        dia = MmxAlpha(d);
sl@0
   306
        sia = MmxNegate(sia);
sl@0
   307
        dia = MmxNegate(dia);
sl@0
   308
        MmxAddMul(s, dia, d, sia);
sl@0
   309
        *dest = MmxFrom(s);
sl@0
   310
        ++dest;
sl@0
   311
        ++src;
sl@0
   312
    }
sl@0
   313
    _mm_empty();
sl@0
   314
}
sl@0
   315
#endif
sl@0
   316
sl@0
   317
static void
sl@0
   318
mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
sl@0
   319
{
sl@0
   320
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   321
sl@0
   322
    const uint32_t *end = dest + width;
sl@0
   323
    while (dest < end) {
sl@0
   324
        __m64 s, d;
sl@0
   325
        s = MmxTo(*src);
sl@0
   326
        d = MmxTo(*dest);
sl@0
   327
        s = MmxAdd(s, d);
sl@0
   328
        *dest = MmxFrom(s);
sl@0
   329
        ++dest;
sl@0
   330
        ++src;
sl@0
   331
    }
sl@0
   332
    _mm_empty();
sl@0
   333
}
sl@0
   334
OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
sl@0
   335
sl@0
   336
#if 0
sl@0
   337
static FASTCALL void
sl@0
   338
mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
sl@0
   339
{
sl@0
   340
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   341
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   342
sl@0
   343
    const CARD32 *end = dest + width;
sl@0
   344
    while (dest < end) {
sl@0
   345
        CARD32 s = *src;
sl@0
   346
        CARD32 d = *dest;
sl@0
   347
        __m64 ms = MmxTo(s);
sl@0
   348
        __m64 md = MmxTo(d);
sl@0
   349
        CARD32 sa = s >> 24;
sl@0
   350
        CARD32 da = ~d >> 24;
sl@0
   351
sl@0
   352
        if (sa > da) {
sl@0
   353
            __m64 msa = MmxTo(FbIntDiv(da, sa));
sl@0
   354
            msa = MmxAlpha(msa);
sl@0
   355
            MmxMul(ms, msa);
sl@0
   356
        }
sl@0
   357
        MmxAdd(md, ms);
sl@0
   358
        *dest = MmxFrom(md);
sl@0
   359
        ++src;
sl@0
   360
        ++dest;
sl@0
   361
    }
sl@0
   362
    _mm_empty();
sl@0
   363
}
sl@0
   364
sl@0
   365
sl@0
   366
static FASTCALL void
sl@0
   367
mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   368
{
sl@0
   369
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   370
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   371
sl@0
   372
    const CARD32 *end = src + width;
sl@0
   373
    while (src < end) {
sl@0
   374
        __m64 a = MmxTo(*mask);
sl@0
   375
        __m64 s = MmxTo(*src);
sl@0
   376
        MmxMul(s, a);
sl@0
   377
        *dest = MmxFrom(s);
sl@0
   378
        ++src;
sl@0
   379
        ++mask;
sl@0
   380
        ++dest;
sl@0
   381
    }
sl@0
   382
    _mm_empty();
sl@0
   383
}
sl@0
   384
sl@0
   385
static FASTCALL void
sl@0
   386
mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   387
{
sl@0
   388
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   389
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   390
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   391
    
sl@0
   392
    const CARD32 *end = src + width;
sl@0
   393
    while (src < end) {
sl@0
   394
        __m64 a = MmxTo(*mask);
sl@0
   395
        __m64 s = MmxTo(*src);
sl@0
   396
        __m64 d = MmxTo(*dest);
sl@0
   397
        __m64 sa = MmxAlpha(s);
sl@0
   398
        MmxMul(s, a);
sl@0
   399
        MmxMul(a, sa);
sl@0
   400
        a = MmxNegate(a);
sl@0
   401
        MmxMulAdd(d, a, s);
sl@0
   402
        *dest = MmxFrom(d);
sl@0
   403
        ++src;
sl@0
   404
        ++dest;
sl@0
   405
        ++mask;
sl@0
   406
    }
sl@0
   407
    _mm_empty();
sl@0
   408
}
sl@0
   409
sl@0
   410
static FASTCALL void
sl@0
   411
mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   412
{
sl@0
   413
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   414
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   415
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   416
    
sl@0
   417
    const CARD32 *end = src + width;
sl@0
   418
    while (src < end) {
sl@0
   419
        __m64 a = MmxTo(*mask);
sl@0
   420
        __m64 s = MmxTo(*src);
sl@0
   421
        __m64 d = MmxTo(*dest);
sl@0
   422
        __m64 da = MmxAlpha(d);
sl@0
   423
        da = MmxNegate(da);
sl@0
   424
        MmxMul(s, a);
sl@0
   425
        MmxMulAdd(s, da, d);
sl@0
   426
        *dest = MmxFrom(s);
sl@0
   427
        ++src;
sl@0
   428
        ++dest;
sl@0
   429
        ++mask;
sl@0
   430
    }
sl@0
   431
    _mm_empty();
sl@0
   432
}
sl@0
   433
sl@0
   434
sl@0
   435
static FASTCALL void
sl@0
   436
mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   437
{
sl@0
   438
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   439
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   440
    
sl@0
   441
    const CARD32 *end = src + width;
sl@0
   442
    while (src < end) {
sl@0
   443
        __m64 a = MmxTo(*mask);
sl@0
   444
        __m64 s = MmxTo(*src);
sl@0
   445
        __m64 d = MmxTo(*dest);
sl@0
   446
        __m64 da = MmxAlpha(d);
sl@0
   447
        MmxMul(s, a);
sl@0
   448
        MmxMul(s, da);
sl@0
   449
        *dest = MmxFrom(s);
sl@0
   450
        ++src;
sl@0
   451
        ++dest;
sl@0
   452
        ++mask;
sl@0
   453
    }
sl@0
   454
    _mm_empty();
sl@0
   455
}
sl@0
   456
sl@0
   457
static FASTCALL void
sl@0
   458
mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   459
{
sl@0
   460
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   461
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   462
    
sl@0
   463
    const CARD32 *end = src + width;
sl@0
   464
    while (src < end) {
sl@0
   465
        __m64 a = MmxTo(*mask);
sl@0
   466
        __m64 s = MmxTo(*src);
sl@0
   467
        __m64 d = MmxTo(*dest);
sl@0
   468
        __m64 sa = MmxAlpha(s);
sl@0
   469
        MmxMul(a, sa);
sl@0
   470
        MmxMul(d, a);
sl@0
   471
        *dest = MmxFrom(d);
sl@0
   472
        ++src;
sl@0
   473
        ++dest;
sl@0
   474
        ++mask;
sl@0
   475
    }
sl@0
   476
    _mm_empty();
sl@0
   477
}
sl@0
   478
sl@0
   479
static FASTCALL void
sl@0
   480
mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   481
{
sl@0
   482
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   483
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   484
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   485
    
sl@0
   486
    const CARD32 *end = src + width;
sl@0
   487
    while (src < end) {
sl@0
   488
        __m64 a = MmxTo(*mask);
sl@0
   489
        __m64 s = MmxTo(*src);
sl@0
   490
        __m64 d = MmxTo(*dest);
sl@0
   491
        __m64 da = MmxAlpha(d);
sl@0
   492
        da = MmxNegate(da);
sl@0
   493
        MmxMul(s, a);
sl@0
   494
        MmxMul(s, da);
sl@0
   495
        *dest = MmxFrom(s);
sl@0
   496
        ++src;
sl@0
   497
        ++dest;
sl@0
   498
        ++mask;
sl@0
   499
    }
sl@0
   500
    _mm_empty();
sl@0
   501
}
sl@0
   502
sl@0
   503
static FASTCALL void
sl@0
   504
mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   505
{
sl@0
   506
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   507
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   508
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   509
    
sl@0
   510
    const CARD32 *end = src + width;
sl@0
   511
    while (src < end) {
sl@0
   512
        __m64 a = MmxTo(*mask);
sl@0
   513
        __m64 s = MmxTo(*src);
sl@0
   514
        __m64 d = MmxTo(*dest);
sl@0
   515
        __m64 sa = MmxAlpha(s);
sl@0
   516
        MmxMul(a, sa);
sl@0
   517
        a = MmxNegate(a);
sl@0
   518
        MmxMul(d, a);
sl@0
   519
        *dest = MmxFrom(d);
sl@0
   520
        ++src;
sl@0
   521
        ++dest;
sl@0
   522
        ++mask;
sl@0
   523
    }
sl@0
   524
    _mm_empty();
sl@0
   525
}
sl@0
   526
sl@0
   527
static FASTCALL void
sl@0
   528
mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   529
{
sl@0
   530
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   531
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   532
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   533
    
sl@0
   534
    const CARD32 *end = src + width;
sl@0
   535
    while (src < end) {
sl@0
   536
        __m64 a = MmxTo(*mask);
sl@0
   537
        __m64 s = MmxTo(*src);
sl@0
   538
        __m64 d = MmxTo(*dest);
sl@0
   539
        __m64 da = MmxAlpha(d);
sl@0
   540
        __m64 sa = MmxAlpha(s); 
sl@0
   541
        MmxMul(s, a);
sl@0
   542
        MmxMul(a, sa);
sl@0
   543
        a = MmxNegate(a);
sl@0
   544
        MmxAddMul(d, a, s, da);
sl@0
   545
        *dest = MmxFrom(d);
sl@0
   546
        ++src;
sl@0
   547
        ++dest;
sl@0
   548
        ++mask;
sl@0
   549
    }
sl@0
   550
    _mm_empty();
sl@0
   551
}
sl@0
   552
sl@0
   553
static FASTCALL void
sl@0
   554
mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   555
{
sl@0
   556
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   557
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   558
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   559
    
sl@0
   560
    const CARD32 *end = src + width;
sl@0
   561
    while (src < end) {
sl@0
   562
        __m64 a = MmxTo(*mask);
sl@0
   563
        __m64 s = MmxTo(*src);
sl@0
   564
        __m64 d = MmxTo(*dest);
sl@0
   565
        __m64 da = MmxAlpha(d);
sl@0
   566
        __m64 sa = MmxAlpha(s)
sl@0
   567
        MmxMul(s, a);
sl@0
   568
        MmxMul(a, sa);
sl@0
   569
        da = MmxNegate(da);
sl@0
   570
        MmxAddMul(d, a, s, da);
sl@0
   571
        *dest = MmxFrom(d);
sl@0
   572
        ++src;
sl@0
   573
        ++dest;
sl@0
   574
        ++mask;
sl@0
   575
    }
sl@0
   576
    _mm_empty();
sl@0
   577
}
sl@0
   578
sl@0
   579
static FASTCALL void
sl@0
   580
mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   581
{
sl@0
   582
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   583
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   584
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
sl@0
   585
    
sl@0
   586
    const CARD32 *end = src + width;
sl@0
   587
    while (src < end) {
sl@0
   588
        __m64 a = MmxTo(*mask);
sl@0
   589
        __m64 s = MmxTo(*src);
sl@0
   590
        __m64 d = MmxTo(*dest);
sl@0
   591
        __m64 da = MmxAlpha(d);
sl@0
   592
        __m64 sa = MmxAlpha(s);
sl@0
   593
        MmxMul(s, a);
sl@0
   594
        MmxMul(a, sa);
sl@0
   595
        da = MmxNegate(da);
sl@0
   596
        a = MmxNegate(a);
sl@0
   597
        MmxAddMul(d, a, s, da);
sl@0
   598
        *dest = MmxFrom(d);
sl@0
   599
        ++src;
sl@0
   600
        ++dest;
sl@0
   601
        ++mask;
sl@0
   602
    }
sl@0
   603
    _mm_empty();
sl@0
   604
}
sl@0
   605
sl@0
   606
static FASTCALL void
sl@0
   607
mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
sl@0
   608
{
sl@0
   609
    const __m64 mmx_0 = _mm_setzero_si64();
sl@0
   610
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
sl@0
   611
    
sl@0
   612
    const CARD32 *end = src + width;
sl@0
   613
    while (src < end) {
sl@0
   614
        __m64 a = MmxTo(*mask);
sl@0
   615
        __m64 s = MmxTo(*src);
sl@0
   616
        __m64 d = MmxTo(*dest);
sl@0
   617
        MmxMul(s, a);
sl@0
   618
        d = MmxAdd(s, d);
sl@0
   619
        *dest = MmxFrom(d);
sl@0
   620
        ++src;
sl@0
   621
        ++dest;
sl@0
   622
        ++mask;
sl@0
   623
    }
sl@0
   624
    _mm_empty();
sl@0
   625
}
sl@0
   626
sl@0
   627
extern FbComposeFunctions composeFunctions;
sl@0
   628
sl@0
   629
void fbComposeSetupMMX(void)
sl@0
   630
{
sl@0
   631
    /* check if we have MMX support and initialize accordingly */
sl@0
   632
    if (fbHaveMMX()) {
sl@0
   633
        composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
sl@0
   634
        composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
sl@0
   635
        composeFunctions.combineU[PictOpIn] = mmxCombineInU;
sl@0
   636
        composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
sl@0
   637
        composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
sl@0
   638
        composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
sl@0
   639
        composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
sl@0
   640
        composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
sl@0
   641
        composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
sl@0
   642
        composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
sl@0
   643
        composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
sl@0
   644
sl@0
   645
        composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
sl@0
   646
        composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
sl@0
   647
        composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
sl@0
   648
        composeFunctions.combineC[PictOpIn] = mmxCombineInC;
sl@0
   649
        composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
sl@0
   650
        composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
sl@0
   651
        composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
sl@0
   652
        composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
sl@0
   653
        composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
sl@0
   654
        composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
sl@0
   655
        composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
sl@0
   656
sl@0
   657
        composeFunctions.combineMaskU = mmxCombineMaskU;
sl@0
   658
    } 
sl@0
   659
}
sl@0
   660
#endif
sl@0
   661
sl@0
   662
sl@0
   663
/* ------------------ MMX code paths called from fbpict.c ----------------------- */
sl@0
   664
sl@0
   665
/*
 * Overlay of an MMX register value and a 64-bit integer, so that MMX
 * constants can be written as plain integer literals in a static
 * initializer and read back as __m64.
 */
typedef union {
  __m64 m64;
  uint64_t ull;
} m64_ull;
sl@0
   669
sl@0
   670
typedef struct
sl@0
   671
{
sl@0
   672
    m64_ull mmx_4x00ff;
sl@0
   673
    m64_ull mmx_4x0080;
sl@0
   674
    m64_ull mmx_565_rgb;
sl@0
   675
    m64_ull mmx_565_unpack_multiplier;
sl@0
   676
    m64_ull mmx_565_r;
sl@0
   677
    m64_ull mmx_565_g;
sl@0
   678
    m64_ull mmx_565_b;
sl@0
   679
    m64_ull mmx_mask_0;
sl@0
   680
    m64_ull mmx_mask_1;
sl@0
   681
    m64_ull mmx_mask_2;
sl@0
   682
    m64_ull mmx_mask_3;
sl@0
   683
    m64_ull mmx_full_alpha;
sl@0
   684
    m64_ull mmx_ffff0000ffff0000;
sl@0
   685
    m64_ull mmx_0000ffff00000000;
sl@0
   686
    m64_ull mmx_000000000000ffff;
sl@0
   687
} MMXData;
sl@0
   688
sl@0
   689
static const MMXData c =
sl@0
   690
{
sl@0
   691
    .mmx_4x00ff.ull =			0x00ff00ff00ff00ffULL,
sl@0
   692
    .mmx_4x0080.ull =			0x0080008000800080ULL,
sl@0
   693
    .mmx_565_rgb.ull =			0x000001f0003f001fULL,
sl@0
   694
    .mmx_565_r.ull =			0x000000f800000000ULL,
sl@0
   695
    .mmx_565_g.ull =			0x0000000000fc0000ULL,
sl@0
   696
    .mmx_565_b.ull =			0x00000000000000f8ULL,
sl@0
   697
    .mmx_mask_0.ull =			0xffffffffffff0000ULL,
sl@0
   698
    .mmx_mask_1.ull =			0xffffffff0000ffffULL,
sl@0
   699
    .mmx_mask_2.ull =			0xffff0000ffffffffULL,
sl@0
   700
    .mmx_mask_3.ull =			0x0000ffffffffffffULL,
sl@0
   701
    .mmx_full_alpha.ull =			0x00ff000000000000ULL,
sl@0
   702
    .mmx_565_unpack_multiplier.ull =	0x0000008404100840ULL,
sl@0
   703
    .mmx_ffff0000ffff0000.ull =		0xffff0000ffff0000ULL,
sl@0
   704
    .mmx_0000ffff00000000.ull =		0x0000ffff00000000ULL,
sl@0
   705
    .mmx_000000000000ffff.ull =		0x000000000000ffffULL,
sl@0
   706
};
sl@0
   707
sl@0
   708
/* MC(name) yields the __m64 view of constant c.mmx_<name>, e.g. MC(4x00ff). */
#define MC(x) ((__m64) c.mmx_##x.m64)
sl@0
   709
sl@0
   710
/*
 * Shifts the whole 64-bit value `v` by `s` bits: left when s > 0, logical
 * right by -s when s < 0, unchanged when s == 0.
 */
static __inline__ __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}
sl@0
   720
sl@0
   721
/*
 * XORs every 16-bit lane of `mask` with 0x00ff.  For lanes holding an
 * 8-bit value x this yields 255 - x.
 */
static __inline__ __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC(4x00ff));
}
sl@0
   726
sl@0
   727
/*
 * Per 16-bit lane: t = a * b + 0x80, result = (t + (t >> 8)) >> 8.
 * For lanes holding 8-bit channel values this is the usual rounded
 * a * b / 255 without a division.
 */
static __inline__ __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC(4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}
sl@0
   739
sl@0
   740
/* Broadcasts 16-bit lane 3 (the top lane) of `pixel` into all four lanes. */
static __inline__ __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
}
sl@0
   745
sl@0
   746
/* Broadcasts 16-bit lane 0 (the bottom lane) of `pixel` into all four lanes. */
static __inline__ __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
sl@0
   751
sl@0
   752
/*
 * Swaps 16-bit lanes 0 and 2 of `pixel` and leaves lanes 1 and 3 in place
 * (i.e. exchanges the two outer colour channels, keeping the top lane).
 */
static __inline__ __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
}
sl@0
   757
sl@0
   758
/* Notes about writing MMX code
 *
 * Give memory operands as the second operand. If you give one as the
 * first, gcc will first load it into a register, then use that
 * register.
 *
 *   i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies, i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
sl@0
   776
sl@0
   777
static __inline__ __m64
sl@0
   778
over (__m64 src, __m64 srca, __m64 dest)
sl@0
   779
{
sl@0
   780
    return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
sl@0
   781
}
sl@0
   782
sl@0
   783
static __inline__ __m64
sl@0
   784
over_rev_non_pre (__m64 src, __m64 dest)
sl@0
   785
{
sl@0
   786
    __m64 srca = expand_alpha (src);
sl@0
   787
    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
sl@0
   788
    
sl@0
   789
    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
sl@0
   790
}
sl@0
   791
sl@0
   792
static __inline__ __m64
sl@0
   793
in (__m64 src,
sl@0
   794
    __m64 mask)
sl@0
   795
{
sl@0
   796
    return pix_multiply (src, mask);
sl@0
   797
}
sl@0
   798
sl@0
   799
static __inline__ __m64
sl@0
   800
in_over (__m64 src,
sl@0
   801
	 __m64 srca,
sl@0
   802
	 __m64 mask,
sl@0
   803
	 __m64 dest)
sl@0
   804
{
sl@0
   805
    return over(in(src, mask), pix_multiply(srca, mask), dest);
sl@0
   806
}
sl@0
   807
sl@0
   808
static __inline__ __m64
sl@0
   809
load8888 (CARD32 v)
sl@0
   810
{
sl@0
   811
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
sl@0
   812
}
sl@0
   813
sl@0
   814
/*
 * Packs two unpacked pixels back to bytes with unsigned saturation: the
 * four lanes of `lo` become bytes 0-3 and the four lanes of `hi` become
 * bytes 4-7 of the result.
 */
static __inline__ __m64
pack8888 (__m64 lo, __m64 hi)
{
    __m64 r;
    r = _mm_packs_pu16 (lo, hi);
    return r;
}
sl@0
   821
sl@0
   822
static __inline__ CARD32
sl@0
   823
store8888 (__m64 v)
sl@0
   824
{
sl@0
   825
    return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
sl@0
   826
}
sl@0
   827
sl@0
   828
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
sl@0
   829
 *
sl@0
   830
 *    00RR00GG00BB
sl@0
   831
 * 
sl@0
   832
 * --- Expanding 565 in the low word ---
sl@0
   833
 * 
sl@0
   834
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
sl@0
   835
 * m = m & (01f0003f001f);
sl@0
   836
 * m = m * (008404100840);
sl@0
   837
 * m = m >> 8;
sl@0
   838
 * 
sl@0
   839
 * Note the trick here - the top word is shifted by another nibble to
sl@0
   840
 * avoid it bumping into the middle word
sl@0
   841
 */
sl@0
   842
static __inline__ __m64
sl@0
   843
expand565 (__m64 pixel, int pos)
sl@0
   844
{
sl@0
   845
    __m64 p = pixel;
sl@0
   846
    __m64 t1, t2;
sl@0
   847
    
sl@0
   848
    /* move pixel to low 16 bit and zero the rest */
sl@0
   849
    p = shift (shift (p, (3 - pos) * 16), -48); 
sl@0
   850
    
sl@0
   851
    t1 = shift (p, 36 - 11);
sl@0
   852
    t2 = shift (p, 16 - 5);
sl@0
   853
    
sl@0
   854
    p = _mm_or_si64 (t1, p);
sl@0
   855
    p = _mm_or_si64 (t2, p);
sl@0
   856
    p = _mm_and_si64 (p, MC(565_rgb));
sl@0
   857
    
sl@0
   858
    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
sl@0
   859
    return _mm_srli_pi16 (pixel, 8);
sl@0
   860
}
sl@0
   861
sl@0
   862
/*
 * Widen one of the two 32-bit pixels held in `in` to four 16-bit lanes:
 * pos == 0 selects the low pixel, any other value the high pixel.
 */
static __inline__ __m64
expand8888 (__m64 in, int pos)
{
    const __m64 zero = _mm_setzero_si64 ();

    return (pos == 0) ? _mm_unpacklo_pi8 (in, zero)
                      : _mm_unpackhi_pi8 (in, zero);
}
sl@0
   870
sl@0
   871
static __inline__ __m64
sl@0
   872
pack565 (__m64 pixel, __m64 target, int pos)
sl@0
   873
{
sl@0
   874
    __m64 p = pixel;
sl@0
   875
    __m64 t = target;
sl@0
   876
    __m64 r, g, b;
sl@0
   877
    
sl@0
   878
    r = _mm_and_si64 (p, MC(565_r));
sl@0
   879
    g = _mm_and_si64 (p, MC(565_g));
sl@0
   880
    b = _mm_and_si64 (p, MC(565_b));
sl@0
   881
    
sl@0
   882
    r = shift (r, - (32 - 8) + pos * 16);
sl@0
   883
    g = shift (g, - (16 - 3) + pos * 16);
sl@0
   884
    b = shift (b, - (0  + 3) + pos * 16);
sl@0
   885
    
sl@0
   886
    if (pos == 0)
sl@0
   887
	t = _mm_and_si64 (t, MC(mask_0));
sl@0
   888
    else if (pos == 1)
sl@0
   889
	t = _mm_and_si64 (t, MC(mask_1));
sl@0
   890
    else if (pos == 2)
sl@0
   891
	t = _mm_and_si64 (t, MC(mask_2));
sl@0
   892
    else if (pos == 3)
sl@0
   893
	t = _mm_and_si64 (t, MC(mask_3));
sl@0
   894
    
sl@0
   895
    p = _mm_or_si64 (r, t);
sl@0
   896
    p = _mm_or_si64 (g, p);
sl@0
   897
    
sl@0
   898
    return _mm_or_si64 (b, p);
sl@0
   899
}
sl@0
   900
sl@0
   901
#ifdef ENABLE_BROKEN_IMPLS
sl@0
   902
/* broken.  See Debian bug #340932 */
sl@0
   903
static void
sl@0
   904
fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
sl@0
   905
{
sl@0
   906
    __m64	vsrc, vsrca;
sl@0
   907
sl@0
   908
    vsrc = load8888 (*src);
sl@0
   909
    vsrca = expand_alpha (vsrc);
sl@0
   910
sl@0
   911
    while (w && (unsigned long)dst & 7)
sl@0
   912
    {
sl@0
   913
        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
sl@0
   914
        
sl@0
   915
        w--;
sl@0
   916
        dst++;
sl@0
   917
    }
sl@0
   918
    
sl@0
   919
    while (w >= 2)
sl@0
   920
    {
sl@0
   921
        __m64 vdest;
sl@0
   922
        __m64 dest0, dest1;
sl@0
   923
        
sl@0
   924
        vdest = *(__m64 *)dst;
sl@0
   925
        
sl@0
   926
        dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
sl@0
   927
        dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
sl@0
   928
        
sl@0
   929
        *(__m64 *)dst = pack8888(dest0, dest1);
sl@0
   930
        
sl@0
   931
        dst += 2;
sl@0
   932
        w -= 2;
sl@0
   933
    }
sl@0
   934
    
sl@0
   935
    while (w)
sl@0
   936
    {
sl@0
   937
        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
sl@0
   938
        
sl@0
   939
        w--;
sl@0
   940
        dst++;
sl@0
   941
    }
sl@0
   942
    
sl@0
   943
    _mm_empty();
sl@0
   944
}
sl@0
   945
OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
sl@0
   946
    OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
sl@0
   947
#endif
sl@0
   948
sl@0
   949
#if 0
sl@0
   950
void
sl@0
   951
fbCompositeSolid_nx0565mmx (CARD8	op,
sl@0
   952
			    PicturePtr pSrc,
sl@0
   953
			    PicturePtr pMask,
sl@0
   954
			    PicturePtr pDst,
sl@0
   955
			    INT16	xSrc,
sl@0
   956
			    INT16	ySrc,
sl@0
   957
			    INT16	xMask,
sl@0
   958
			    INT16	yMask,
sl@0
   959
			    INT16	xDst,
sl@0
   960
			    INT16	yDst,
sl@0
   961
			    CARD16	width,
sl@0
   962
			    CARD16	height)
sl@0
   963
{
sl@0
   964
    CARD32	src;
sl@0
   965
    CARD16	*dstLine, *dst;
sl@0
   966
    CARD16	w;
sl@0
   967
    FbStride	dstStride;
sl@0
   968
    __m64	vsrc, vsrca;
sl@0
   969
    
sl@0
   970
    CHECKPOINT();
sl@0
   971
    
sl@0
   972
    fbComposeGetSolid(pSrc, src, pDst->format);
sl@0
   973
    
sl@0
   974
    if (src >> 24 == 0)
sl@0
   975
	return;
sl@0
   976
    
sl@0
   977
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
sl@0
   978
    
sl@0
   979
    vsrc = load8888 (src);
sl@0
   980
    vsrca = expand_alpha (vsrc);
sl@0
   981
    
sl@0
   982
    while (height--)
sl@0
   983
    {
sl@0
   984
	dst = dstLine;
sl@0
   985
	dstLine += dstStride;
sl@0
   986
	w = width;
sl@0
   987
	
sl@0
   988
	CHECKPOINT();
sl@0
   989
	
sl@0
   990
	while (w && (unsigned long)dst & 7)
sl@0
   991
	{
sl@0
   992
	    ullong d = *dst;
sl@0
   993
	    __m64 vdest = expand565 ((__m64)d, 0);
sl@0
   994
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
sl@0
   995
	    *dst = (ullong)vdest;
sl@0
   996
	    
sl@0
   997
	    w--;
sl@0
   998
	    dst++;
sl@0
   999
	}
sl@0
  1000
	
sl@0
  1001
	while (w >= 4)
sl@0
  1002
	{
sl@0
  1003
	    __m64 vdest;
sl@0
  1004
	    
sl@0
  1005
	    vdest = *(__m64 *)dst;
sl@0
  1006
	    
sl@0
  1007
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
sl@0
  1008
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
sl@0
  1009
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
sl@0
  1010
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
sl@0
  1011
	    
sl@0
  1012
	    *(__m64 *)dst = vdest;
sl@0
  1013
	    
sl@0
  1014
	    dst += 4;
sl@0
  1015
	    w -= 4;
sl@0
  1016
	}
sl@0
  1017
	
sl@0
  1018
	CHECKPOINT();
sl@0
  1019
	
sl@0
  1020
	while (w)
sl@0
  1021
	{
sl@0
  1022
	    ullong d = *dst;
sl@0
  1023
	    __m64 vdest = expand565 ((__m64)d, 0);
sl@0
  1024
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
sl@0
  1025
	    *dst = (ullong)vdest;
sl@0
  1026
	    
sl@0
  1027
	    w--;
sl@0
  1028
	    dst++;
sl@0
  1029
	}
sl@0
  1030
    }
sl@0
  1031
    
sl@0
  1032
    _mm_empty();
sl@0
  1033
}
sl@0
  1034
#endif
sl@0
  1035
sl@0
  1036
#if 0
sl@0
  1037
static void
sl@0
  1038
fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
sl@0
  1039
{
sl@0
  1040
    CARD32	src, srca;
sl@0
  1041
    CARD32	*dstLine;
sl@0
  1042
    CARD32	*maskLine;
sl@0
  1043
    FbStride	dstStride, maskStride;
sl@0
  1044
    __m64	vsrc, vsrca;
sl@0
  1045
    
sl@0
  1046
    
sl@0
  1047
    while (twidth && (unsigned long)q & 7)
sl@0
  1048
    {
sl@0
  1049
        CARD32 m = *(CARD32 *)p;
sl@0
  1050
        
sl@0
  1051
        if (m)
sl@0
  1052
        {
sl@0
  1053
            __m64 vdest = load8888(*q);
sl@0
  1054
            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
sl@0
  1055
            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
sl@0
  1056
        }
sl@0
  1057
        
sl@0
  1058
        twidth--;
sl@0
  1059
        p++;
sl@0
  1060
        q++;
sl@0
  1061
    }
sl@0
  1062
    
sl@0
  1063
    while (twidth >= 2)
sl@0
  1064
    {
sl@0
  1065
        CARD32 m0, m1;
sl@0
  1066
        m0 = *p;
sl@0
  1067
        m1 = *(p + 1);
sl@0
  1068
        
sl@0
  1069
        if (m0 | m1)
sl@0
  1070
        {
sl@0
  1071
            __m64 dest0, dest1;
sl@0
  1072
            __m64 vdest = *(__m64 *)q;
sl@0
  1073
            
sl@0
  1074
            dest0 = in_over(vsrc, vsrca, load8888(m0),
sl@0
  1075
                            expand8888 (vdest, 0));
sl@0
  1076
            dest1 = in_over(vsrc, vsrca, load8888(m1),
sl@0
  1077
                            expand8888 (vdest, 1));
sl@0
  1078
            
sl@0
  1079
            *(__m64 *)q = pack8888(dest0, dest1);
sl@0
  1080
        }
sl@0
  1081
        
sl@0
  1082
        p += 2;
sl@0
  1083
        q += 2;
sl@0
  1084
        twidth -= 2;
sl@0
  1085
    }
sl@0
  1086
    
sl@0
  1087
    while (twidth)
sl@0
  1088
    {
sl@0
  1089
        CARD32 m = *(CARD32 *)p;
sl@0
  1090
        
sl@0
  1091
        if (m)
sl@0
  1092
        {
sl@0
  1093
            __m64 vdest = load8888(*q);
sl@0
  1094
            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
sl@0
  1095
            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
sl@0
  1096
        }
sl@0
  1097
        
sl@0
  1098
        twidth--;
sl@0
  1099
        p++;
sl@0
  1100
        q++;
sl@0
  1101
    }
sl@0
  1102
    
sl@0
  1103
    _mm_empty();
sl@0
  1104
}
sl@0
  1105
#endif
sl@0
  1106
sl@0
  1107
#if 0
sl@0
  1108
static void
sl@0
  1109
fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
sl@0
  1110
    int width)
sl@0
  1111
{
sl@0
  1112
sl@0
  1113
    mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
sl@0
  1114
    vmask = load8888 (mask);
sl@0
  1115
    srca = MC(4x00ff);
sl@0
  1116
    
sl@0
  1117
    while (height--)
sl@0
  1118
    {
sl@0
  1119
	dst = dstLine;
sl@0
  1120
	dstLine += dstStride;
sl@0
  1121
	src = srcLine;
sl@0
  1122
	srcLine += srcStride;
sl@0
  1123
	w = width;
sl@0
  1124
sl@0
  1125
	while (w && (unsigned long)dst & 7)
sl@0
  1126
	{
sl@0
  1127
	    __m64 s = load8888 (*src);
sl@0
  1128
	    __m64 d = load8888 (*dst);
sl@0
  1129
	    
sl@0
  1130
	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
sl@0
  1131
	    
sl@0
  1132
	    w--;
sl@0
  1133
	    dst++;
sl@0
  1134
	    src++;
sl@0
  1135
	}
sl@0
  1136
sl@0
  1137
	while (w >= 16)
sl@0
  1138
	{
sl@0
  1139
	    __m64 vd0 = *(__m64 *)(dst + 0);
sl@0
  1140
	    __m64 vd1 = *(__m64 *)(dst + 2);
sl@0
  1141
	    __m64 vd2 = *(__m64 *)(dst + 4);
sl@0
  1142
	    __m64 vd3 = *(__m64 *)(dst + 6);
sl@0
  1143
	    __m64 vd4 = *(__m64 *)(dst + 8);
sl@0
  1144
	    __m64 vd5 = *(__m64 *)(dst + 10);
sl@0
  1145
	    __m64 vd6 = *(__m64 *)(dst + 12);
sl@0
  1146
	    __m64 vd7 = *(__m64 *)(dst + 14);
sl@0
  1147
sl@0
  1148
	    __m64 vs0 = *(__m64 *)(src + 0);
sl@0
  1149
	    __m64 vs1 = *(__m64 *)(src + 2);
sl@0
  1150
	    __m64 vs2 = *(__m64 *)(src + 4);
sl@0
  1151
	    __m64 vs3 = *(__m64 *)(src + 6);
sl@0
  1152
	    __m64 vs4 = *(__m64 *)(src + 8);
sl@0
  1153
	    __m64 vs5 = *(__m64 *)(src + 10);
sl@0
  1154
	    __m64 vs6 = *(__m64 *)(src + 12);
sl@0
  1155
	    __m64 vs7 = *(__m64 *)(src + 14);
sl@0
  1156
sl@0
  1157
	    vd0 = (__m64)pack8888 (
sl@0
  1158
		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
sl@0
  1159
		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
sl@0
  1160
	
sl@0
  1161
	    vd1 = (__m64)pack8888 (
sl@0
  1162
		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
sl@0
  1163
		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
sl@0
  1164
	
sl@0
  1165
	    vd2 = (__m64)pack8888 (
sl@0
  1166
		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
sl@0
  1167
		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
sl@0
  1168
	
sl@0
  1169
	    vd3 = (__m64)pack8888 (
sl@0
  1170
		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
sl@0
  1171
		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
sl@0
  1172
	
sl@0
  1173
	    vd4 = (__m64)pack8888 (
sl@0
  1174
		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
sl@0
  1175
		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
sl@0
  1176
	
sl@0
  1177
	    vd5 = (__m64)pack8888 (
sl@0
  1178
		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
sl@0
  1179
		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
sl@0
  1180
	
sl@0
  1181
	    vd6 = (__m64)pack8888 (
sl@0
  1182
		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
sl@0
  1183
		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
sl@0
  1184
	
sl@0
  1185
	    vd7 = (__m64)pack8888 (
sl@0
  1186
		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
sl@0
  1187
		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
sl@0
  1188
sl@0
  1189
    	    *(__m64 *)(dst + 0) = vd0;
sl@0
  1190
	    *(__m64 *)(dst + 2) = vd1;
sl@0
  1191
	    *(__m64 *)(dst + 4) = vd2;
sl@0
  1192
	    *(__m64 *)(dst + 6) = vd3;
sl@0
  1193
	    *(__m64 *)(dst + 8) = vd4;
sl@0
  1194
	    *(__m64 *)(dst + 10) = vd5;
sl@0
  1195
	    *(__m64 *)(dst + 12) = vd6;
sl@0
  1196
	    *(__m64 *)(dst + 14) = vd7;
sl@0
  1197
	
sl@0
  1198
	    w -= 16;
sl@0
  1199
	    dst += 16;
sl@0
  1200
	    src += 16;
sl@0
  1201
	}
sl@0
  1202
	
sl@0
  1203
	while (w)
sl@0
  1204
	{
sl@0
  1205
	    __m64 s = load8888 (*src);
sl@0
  1206
	    __m64 d = load8888 (*dst);
sl@0
  1207
	    
sl@0
  1208
	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
sl@0
  1209
	    
sl@0
  1210
	    w--;
sl@0
  1211
	    dst++;
sl@0
  1212
	    src++;
sl@0
  1213
	}
sl@0
  1214
    }
sl@0
  1215
sl@0
  1216
    _mm_empty(); 
sl@0
  1217
}
sl@0
  1218
sl@0
  1219
void
sl@0
  1220
fbCompositeSrc_8888x8888mmx (CARD8	op,
sl@0
  1221
			     PicturePtr pSrc,
sl@0
  1222
			     PicturePtr pMask,
sl@0
  1223
			     PicturePtr pDst,
sl@0
  1224
			     INT16	xSrc,
sl@0
  1225
			     INT16	ySrc,
sl@0
  1226
			     INT16      xMask,
sl@0
  1227
			     INT16      yMask,
sl@0
  1228
			     INT16      xDst,
sl@0
  1229
			     INT16      yDst,
sl@0
  1230
			     CARD16     width,
sl@0
  1231
			     CARD16     height)
sl@0
  1232
{
sl@0
  1233
    CARD32	*dstLine, *dst;
sl@0
  1234
    CARD32	*srcLine, *src;
sl@0
  1235
    FbStride	dstStride, srcStride;
sl@0
  1236
    CARD16	w;
sl@0
  1237
    __m64  srca;
sl@0
  1238
    
sl@0
  1239
    CHECKPOINT();
sl@0
  1240
    
sl@0
  1241
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
sl@0
  1242
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
sl@0
  1243
sl@0
  1244
    srca = MC (4x00ff);
sl@0
  1245
    
sl@0
  1246
    while (height--)
sl@0
  1247
    {
sl@0
  1248
	dst = dstLine;
sl@0
  1249
	dstLine += dstStride;
sl@0
  1250
	src = srcLine;
sl@0
  1251
	srcLine += srcStride;
sl@0
  1252
	w = width;
sl@0
  1253
sl@0
  1254
	while (w && (unsigned long)dst & 7)
sl@0
  1255
	{
sl@0
  1256
	    __m64 s = load8888 (*src);
sl@0
  1257
	    __m64 d = load8888 (*dst);
sl@0
  1258
	    
sl@0
  1259
	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
sl@0
  1260
	    
sl@0
  1261
	    w--;
sl@0
  1262
	    dst++;
sl@0
  1263
	    src++;
sl@0
  1264
	}
sl@0
  1265
sl@0
  1266
	while (w >= 2)
sl@0
  1267
	{
sl@0
  1268
	    __m64 vd = *(__m64 *)(dst + 0);
sl@0
  1269
	    __m64 vs = *(__m64 *)(src + 0);
sl@0
  1270
	    __m64 vs0 = expand8888 (vs, 0);
sl@0
  1271
	    __m64 vs1 = expand8888 (vs, 1);
sl@0
  1272
sl@0
  1273
	    *(__m64 *)dst = (__m64)pack8888 (
sl@0
  1274
		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
sl@0
  1275
		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
sl@0
  1276
	    
sl@0
  1277
	    w -= 2;
sl@0
  1278
	    dst += 2;
sl@0
  1279
	    src += 2;
sl@0
  1280
	}
sl@0
  1281
	
sl@0
  1282
	while (w)
sl@0
  1283
	{
sl@0
  1284
	    __m64 s = load8888 (*src);
sl@0
  1285
	    __m64 d = load8888 (*dst);
sl@0
  1286
	    
sl@0
  1287
	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
sl@0
  1288
				     (__m64)_mm_setzero_si64());
sl@0
  1289
	    
sl@0
  1290
	    w--;
sl@0
  1291
	    dst++;
sl@0
  1292
	    src++;
sl@0
  1293
	}
sl@0
  1294
    }
sl@0
  1295
sl@0
  1296
    _mm_empty(); 
sl@0
  1297
}
sl@0
  1298
sl@0
  1299
void
sl@0
  1300
fbCompositeSolidMask_nx8x8888mmx (CARD8      op,
sl@0
  1301
				  PicturePtr pSrc,
sl@0
  1302
				  PicturePtr pMask,
sl@0
  1303
				  PicturePtr pDst,
sl@0
  1304
				  INT16      xSrc,
sl@0
  1305
				  INT16      ySrc,
sl@0
  1306
				  INT16      xMask,
sl@0
  1307
				  INT16      yMask,
sl@0
  1308
				  INT16      xDst,
sl@0
  1309
				  INT16      yDst,
sl@0
  1310
				  CARD16     width,
sl@0
  1311
				  CARD16     height)
sl@0
  1312
{
sl@0
  1313
    CARD32	src, srca;
sl@0
  1314
    CARD32	*dstLine, *dst;
sl@0
  1315
    CARD8	*maskLine, *mask;
sl@0
  1316
    FbStride	dstStride, maskStride;
sl@0
  1317
    CARD16	w;
sl@0
  1318
    __m64	vsrc, vsrca;
sl@0
  1319
    ullong	srcsrc;
sl@0
  1320
    
sl@0
  1321
    CHECKPOINT();
sl@0
  1322
    
sl@0
  1323
    fbComposeGetSolid(pSrc, src, pDst->format);
sl@0
  1324
    
sl@0
  1325
    srca = src >> 24;
sl@0
  1326
    if (srca == 0)
sl@0
  1327
	return;
sl@0
  1328
    
sl@0
  1329
    srcsrc = (unsigned long long)src << 32 | src;
sl@0
  1330
    
sl@0
  1331
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
sl@0
  1332
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
sl@0
  1333
    
sl@0
  1334
    vsrc = load8888 (src);
sl@0
  1335
    vsrca = expand_alpha (vsrc);
sl@0
  1336
    
sl@0
  1337
    while (height--)
sl@0
  1338
    {
sl@0
  1339
	dst = dstLine;
sl@0
  1340
	dstLine += dstStride;
sl@0
  1341
	mask = maskLine;
sl@0
  1342
	maskLine += maskStride;
sl@0
  1343
	w = width;
sl@0
  1344
	
sl@0
  1345
	CHECKPOINT();
sl@0
  1346
	
sl@0
  1347
	while (w && (unsigned long)dst & 7)
sl@0
  1348
	{
sl@0
  1349
	    ullong m = *mask;
sl@0
  1350
	    
sl@0
  1351
	    if (m)
sl@0
  1352
	    {
sl@0
  1353
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
sl@0
  1354
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
sl@0
  1355
	    }
sl@0
  1356
	    
sl@0
  1357
	    w--;
sl@0
  1358
	    mask++;
sl@0
  1359
	    dst++;
sl@0
  1360
	}
sl@0
  1361
	
sl@0
  1362
	CHECKPOINT();
sl@0
  1363
	
sl@0
  1364
	while (w >= 2)
sl@0
  1365
	{
sl@0
  1366
	    ullong m0, m1;
sl@0
  1367
	    m0 = *mask;
sl@0
  1368
	    m1 = *(mask + 1);
sl@0
  1369
	    
sl@0
  1370
	    if (srca == 0xff && (m0 & m1) == 0xff)
sl@0
  1371
	    {
sl@0
  1372
		*(unsigned long long *)dst = srcsrc;
sl@0
  1373
	    }
sl@0
  1374
	    else if (m0 | m1)
sl@0
  1375
	    {
sl@0
  1376
		__m64 vdest;
sl@0
  1377
		__m64 dest0, dest1;
sl@0
  1378
		
sl@0
  1379
		vdest = *(__m64 *)dst;
sl@0
  1380
		
sl@0
  1381
		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
sl@0
  1382
		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
sl@0
  1383
		
sl@0
  1384
		*(__m64 *)dst = pack8888(dest0, dest1);
sl@0
  1385
	    }
sl@0
  1386
	    
sl@0
  1387
	    mask += 2;
sl@0
  1388
	    dst += 2;
sl@0
  1389
	    w -= 2;
sl@0
  1390
	}
sl@0
  1391
	
sl@0
  1392
	CHECKPOINT();
sl@0
  1393
	
sl@0
  1394
	while (w)
sl@0
  1395
	{
sl@0
  1396
	    ullong m = *mask;
sl@0
  1397
	    
sl@0
  1398
	    if (m)
sl@0
  1399
	    {
sl@0
  1400
		__m64 vdest = load8888(*dst);
sl@0
  1401
		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
sl@0
  1402
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
sl@0
  1403
	    }
sl@0
  1404
	    
sl@0
  1405
	    w--;
sl@0
  1406
	    mask++;
sl@0
  1407
	    dst++;
sl@0
  1408
	}
sl@0
  1409
    }
sl@0
  1410
    
sl@0
  1411
    _mm_empty();
sl@0
  1412
}
sl@0
  1413
sl@0
  1414
sl@0
  1415
void
sl@0
  1416
fbCompositeSolidMask_nx8x0565mmx (CARD8      op,
sl@0
  1417
				  PicturePtr pSrc,
sl@0
  1418
				  PicturePtr pMask,
sl@0
  1419
				  PicturePtr pDst,
sl@0
  1420
				  INT16      xSrc,
sl@0
  1421
				  INT16      ySrc,
sl@0
  1422
				  INT16      xMask,
sl@0
  1423
				  INT16      yMask,
sl@0
  1424
				  INT16      xDst,
sl@0
  1425
				  INT16      yDst,
sl@0
  1426
				  CARD16     width,
sl@0
  1427
				  CARD16     height)
sl@0
  1428
{
sl@0
  1429
    CARD32	src, srca;
sl@0
  1430
    CARD16	*dstLine, *dst;
sl@0
  1431
    CARD8	*maskLine, *mask;
sl@0
  1432
    FbStride	dstStride, maskStride;
sl@0
  1433
    CARD16	w;
sl@0
  1434
    __m64	vsrc, vsrca;
sl@0
  1435
    unsigned long long srcsrcsrcsrc, src16;
sl@0
  1436
    
sl@0
  1437
    CHECKPOINT();
sl@0
  1438
    
sl@0
  1439
    fbComposeGetSolid(pSrc, src, pDst->format);
sl@0
  1440
    
sl@0
  1441
    srca = src >> 24;
sl@0
  1442
    if (srca == 0)
sl@0
  1443
	return;
sl@0
  1444
    
sl@0
  1445
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
sl@0
  1446
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
sl@0
  1447
    
sl@0
  1448
    vsrc = load8888 (src);
sl@0
  1449
    vsrca = expand_alpha (vsrc);
sl@0
  1450
    
sl@0
  1451
    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
sl@0
  1452
    
sl@0
  1453
    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
sl@0
  1454
	(ullong)src16 << 16 | (ullong)src16;
sl@0
  1455
    
sl@0
  1456
    while (height--)
sl@0
  1457
    {
sl@0
  1458
	dst = dstLine;
sl@0
  1459
	dstLine += dstStride;
sl@0
  1460
	mask = maskLine;
sl@0
  1461
	maskLine += maskStride;
sl@0
  1462
	w = width;
sl@0
  1463
	
sl@0
  1464
	CHECKPOINT();
sl@0
  1465
	
sl@0
  1466
	while (w && (unsigned long)dst & 7)
sl@0
  1467
	{
sl@0
  1468
	    ullong m = *mask;
sl@0
  1469
	    
sl@0
  1470
	    if (m)
sl@0
  1471
	    {
sl@0
  1472
		ullong d = *dst;
sl@0
  1473
		__m64 vd = (__m64)d;
sl@0
  1474
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
sl@0
  1475
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
sl@0
  1476
	    }
sl@0
  1477
	    
sl@0
  1478
	    w--;
sl@0
  1479
	    mask++;
sl@0
  1480
	    dst++;
sl@0
  1481
	}
sl@0
  1482
	
sl@0
  1483
	CHECKPOINT();
sl@0
  1484
	
sl@0
  1485
	while (w >= 4)
sl@0
  1486
	{
sl@0
  1487
	    ullong m0, m1, m2, m3;
sl@0
  1488
	    m0 = *mask;
sl@0
  1489
	    m1 = *(mask + 1);
sl@0
  1490
	    m2 = *(mask + 2);
sl@0
  1491
	    m3 = *(mask + 3);
sl@0
  1492
	    
sl@0
  1493
	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
sl@0
  1494
	    {
sl@0
  1495
		*(unsigned long long *)dst = srcsrcsrcsrc;
sl@0
  1496
	    }
sl@0
  1497
	    else if (m0 | m1 | m2 | m3)
sl@0
  1498
	    {
sl@0
  1499
		__m64 vdest;
sl@0
  1500
		__m64 vm0, vm1, vm2, vm3;
sl@0
  1501
		
sl@0
  1502
		vdest = *(__m64 *)dst;
sl@0
  1503
		
sl@0
  1504
		vm0 = (__m64)m0;
sl@0
  1505
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
sl@0
  1506
		vm1 = (__m64)m1;
sl@0
  1507
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
sl@0
  1508
		vm2 = (__m64)m2;
sl@0
  1509
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
sl@0
  1510
		vm3 = (__m64)m3;
sl@0
  1511
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
sl@0
  1512
		
sl@0
  1513
		*(__m64 *)dst = vdest;
sl@0
  1514
	    }
sl@0
  1515
	    
sl@0
  1516
	    w -= 4;
sl@0
  1517
	    mask += 4;
sl@0
  1518
	    dst += 4;
sl@0
  1519
	}
sl@0
  1520
	
sl@0
  1521
	CHECKPOINT();
sl@0
  1522
	
sl@0
  1523
	while (w)
sl@0
  1524
	{
sl@0
  1525
	    ullong m = *mask;
sl@0
  1526
	    
sl@0
  1527
	    if (m)
sl@0
  1528
	    {
sl@0
  1529
		ullong d = *dst;
sl@0
  1530
		__m64 vd = (__m64)d;
sl@0
  1531
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
sl@0
  1532
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
sl@0
  1533
	    }
sl@0
  1534
	    
sl@0
  1535
	    w--;
sl@0
  1536
	    mask++;
sl@0
  1537
	    dst++;
sl@0
  1538
	}
sl@0
  1539
    }
sl@0
  1540
    
sl@0
  1541
    _mm_empty();
sl@0
  1542
}
sl@0
  1543
sl@0
  1544
void
sl@0
  1545
fbCompositeSrc_8888RevNPx0565mmx (CARD8      op,
sl@0
  1546
				  PicturePtr pSrc,
sl@0
  1547
				  PicturePtr pMask,
sl@0
  1548
				  PicturePtr pDst,
sl@0
  1549
				  INT16      xSrc,
sl@0
  1550
				  INT16      ySrc,
sl@0
  1551
				  INT16      xMask,
sl@0
  1552
				  INT16      yMask,
sl@0
  1553
				  INT16      xDst,
sl@0
  1554
				  INT16      yDst,
sl@0
  1555
				  CARD16     width,
sl@0
  1556
				  CARD16     height)
sl@0
  1557
{
sl@0
  1558
    CARD16	*dstLine, *dst;
sl@0
  1559
    CARD32	*srcLine, *src;
sl@0
  1560
    FbStride	dstStride, srcStride;
sl@0
  1561
    CARD16	w;
sl@0
  1562
    
sl@0
  1563
    CHECKPOINT();
sl@0
  1564
    
sl@0
  1565
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
sl@0
  1566
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
sl@0
  1567
    
sl@0
  1568
    assert (pSrc->pDrawable == pMask->pDrawable);
sl@0
  1569
    
sl@0
  1570
    while (height--)
sl@0
  1571
    {
sl@0
  1572
	dst = dstLine;
sl@0
  1573
	dstLine += dstStride;
sl@0
  1574
	src = srcLine;
sl@0
  1575
	srcLine += srcStride;
sl@0
  1576
	w = width;
sl@0
  1577
	
sl@0
  1578
	CHECKPOINT();
sl@0
  1579
	
sl@0
  1580
	while (w && (unsigned long)dst & 7)
sl@0
  1581
	{
sl@0
  1582
	    __m64 vsrc = load8888 (*src);
sl@0
  1583
	    ullong d = *dst;
sl@0
  1584
	    __m64 vdest = expand565 ((__m64)d, 0);
sl@0
  1585
	    
sl@0
  1586
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
sl@0
  1587
	    
sl@0
  1588
	    *dst = (ullong)vdest;
sl@0
  1589
	    
sl@0
  1590
	    w--;
sl@0
  1591
	    dst++;
sl@0
  1592
	    src++;
sl@0
  1593
	}
sl@0
  1594
	
sl@0
  1595
	CHECKPOINT();
sl@0
  1596
	
sl@0
  1597
	while (w >= 4)
sl@0
  1598
	{
sl@0
  1599
	    CARD32 s0, s1, s2, s3;
sl@0
  1600
	    unsigned char a0, a1, a2, a3;
sl@0
  1601
	    
sl@0
  1602
	    s0 = *src;
sl@0
  1603
	    s1 = *(src + 1);
sl@0
  1604
	    s2 = *(src + 2);
sl@0
  1605
	    s3 = *(src + 3);
sl@0
  1606
	    
sl@0
  1607
	    a0 = (s0 >> 24);
sl@0
  1608
	    a1 = (s1 >> 24);
sl@0
  1609
	    a2 = (s2 >> 24);
sl@0
  1610
	    a3 = (s3 >> 24);
sl@0
  1611
	    
sl@0
  1612
	    if ((a0 & a1 & a2 & a3) == 0xFF)
sl@0
  1613
	    {
sl@0
  1614
		__m64 vdest;
sl@0
  1615
		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
sl@0
  1616
		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
sl@0
  1617
		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
sl@0
  1618
		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
sl@0
  1619
		
sl@0
  1620
		*(__m64 *)dst = vdest;
sl@0
  1621
	    }
sl@0
  1622
	    else if (a0 | a1 | a2 | a3)
sl@0
  1623
	    {
sl@0
  1624
		__m64 vdest = *(__m64 *)dst;
sl@0
  1625
		
sl@0
  1626
		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
sl@0
  1627
	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
sl@0
  1628
		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
sl@0
  1629
		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
sl@0
  1630
		
sl@0
  1631
		*(__m64 *)dst = vdest;
sl@0
  1632
	    }
sl@0
  1633
	    
sl@0
  1634
	    w -= 4;
sl@0
  1635
	    dst += 4;
sl@0
  1636
	    src += 4;
sl@0
  1637
	}
sl@0
  1638
	
sl@0
  1639
	CHECKPOINT();
sl@0
  1640
	
sl@0
  1641
	while (w)
sl@0
  1642
	{
sl@0
  1643
	    __m64 vsrc = load8888 (*src);
sl@0
  1644
	    ullong d = *dst;
sl@0
  1645
	    __m64 vdest = expand565 ((__m64)d, 0);
sl@0
  1646
	    
sl@0
  1647
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
sl@0
  1648
	    
sl@0
  1649
	    *dst = (ullong)vdest;
sl@0
  1650
	    
sl@0
  1651
	    w--;
sl@0
  1652
	    dst++;
sl@0
  1653
	    src++;
sl@0
  1654
	}
sl@0
  1655
    }
sl@0
  1656
    
sl@0
  1657
    _mm_empty();
sl@0
  1658
}
sl@0
  1659
sl@0
  1660
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
sl@0
  1661
sl@0
  1662
void
sl@0
  1663
fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
sl@0
  1664
				  PicturePtr pSrc,
sl@0
  1665
				  PicturePtr pMask,
sl@0
  1666
				  PicturePtr pDst,
sl@0
  1667
				  INT16      xSrc,
sl@0
  1668
				  INT16      ySrc,
sl@0
  1669
				  INT16      xMask,
sl@0
  1670
				  INT16      yMask,
sl@0
  1671
				  INT16      xDst,
sl@0
  1672
				  INT16      yDst,
sl@0
  1673
				  CARD16     width,
sl@0
  1674
				  CARD16     height)
sl@0
  1675
{
sl@0
  1676
    CARD32	*dstLine, *dst;
sl@0
  1677
    CARD32	*srcLine, *src;
sl@0
  1678
    FbStride	dstStride, srcStride;
sl@0
  1679
    CARD16	w;
sl@0
  1680
    
sl@0
  1681
    CHECKPOINT();
sl@0
  1682
    
sl@0
  1683
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
sl@0
  1684
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
sl@0
  1685
    
sl@0
  1686
    assert (pSrc->pDrawable == pMask->pDrawable);
sl@0
  1687
    
sl@0
  1688
    while (height--)
sl@0
  1689
    {
sl@0
  1690
	dst = dstLine;
sl@0
  1691
	dstLine += dstStride;
sl@0
  1692
	src = srcLine;
sl@0
  1693
	srcLine += srcStride;
sl@0
  1694
	w = width;
sl@0
  1695
	
sl@0
  1696
	while (w && (unsigned long)dst & 7)
sl@0
  1697
	{
sl@0
  1698
	    __m64 s = load8888 (*src);
sl@0
  1699
	    __m64 d = load8888 (*dst);
sl@0
  1700
	    
sl@0
  1701
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
sl@0
  1702
	    
sl@0
  1703
	    w--;
sl@0
  1704
	    dst++;
sl@0
  1705
	    src++;
sl@0
  1706
	}
sl@0
  1707
	
sl@0
  1708
	while (w >= 2)
sl@0
  1709
	{
sl@0
  1710
	    ullong s0, s1;
sl@0
  1711
	    unsigned char a0, a1;
sl@0
  1712
	    __m64 d0, d1;
sl@0
  1713
	    
sl@0
  1714
	    s0 = *src;
sl@0
  1715
	    s1 = *(src + 1);
sl@0
  1716
	    
sl@0
  1717
	    a0 = (s0 >> 24);
sl@0
  1718
	    a1 = (s1 >> 24);
sl@0
  1719
	    
sl@0
  1720
	    if ((a0 & a1) == 0xFF)
sl@0
  1721
	    {
sl@0
  1722
		d0 = invert_colors(load8888(s0));
sl@0
  1723
		d1 = invert_colors(load8888(s1));
sl@0
  1724
		
sl@0
  1725
		*(__m64 *)dst = pack8888 (d0, d1);
sl@0
  1726
	    }
sl@0
  1727
	    else if (a0 | a1)
sl@0
  1728
	    {
sl@0
  1729
		__m64 vdest = *(__m64 *)dst;
sl@0
  1730
		
sl@0
  1731
		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
sl@0
  1732
		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
sl@0
  1733
		
sl@0
  1734
		*(__m64 *)dst = pack8888 (d0, d1);
sl@0
  1735
	    }
sl@0
  1736
	    
sl@0
  1737
	    w -= 2;
sl@0
  1738
	    dst += 2;
sl@0
  1739
	    src += 2;
sl@0
  1740
	}
sl@0
  1741
	
sl@0
  1742
	while (w)
sl@0
  1743
	{
sl@0
  1744
	    __m64 s = load8888 (*src);
sl@0
  1745
	    __m64 d = load8888 (*dst);
sl@0
  1746
	    
sl@0
  1747
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
sl@0
  1748
	    
sl@0
  1749
	    w--;
sl@0
  1750
	    dst++;
sl@0
  1751
	    src++;
sl@0
  1752
	}
sl@0
  1753
    }
sl@0
  1754
    
sl@0
  1755
    _mm_empty();
sl@0
  1756
}
sl@0
  1757
sl@0
  1758
void
sl@0
  1759
fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
sl@0
  1760
				      PicturePtr pSrc,
sl@0
  1761
				      PicturePtr pMask,
sl@0
  1762
				      PicturePtr pDst,
sl@0
  1763
				      INT16      xSrc,
sl@0
  1764
				      INT16      ySrc,
sl@0
  1765
				      INT16      xMask,
sl@0
  1766
				      INT16      yMask,
sl@0
  1767
				      INT16      xDst,
sl@0
  1768
				      INT16      yDst,
sl@0
  1769
				      CARD16     width,
sl@0
  1770
				      CARD16     height)
sl@0
  1771
{
sl@0
  1772
    CARD32	src, srca;
sl@0
  1773
    CARD16	*dstLine;
sl@0
  1774
    CARD32	*maskLine;
sl@0
  1775
    FbStride	dstStride, maskStride;
sl@0
  1776
    __m64  vsrc, vsrca;
sl@0
  1777
    
sl@0
  1778
    CHECKPOINT();
sl@0
  1779
    
sl@0
  1780
    fbComposeGetSolid(pSrc, src, pDst->format);
sl@0
  1781
    
sl@0
  1782
    srca = src >> 24;
sl@0
  1783
    if (srca == 0)
sl@0
  1784
	return;
sl@0
  1785
    
sl@0
  1786
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
sl@0
  1787
    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
sl@0
  1788
    
sl@0
  1789
    vsrc = load8888 (src);
sl@0
  1790
    vsrca = expand_alpha (vsrc);
sl@0
  1791
    
sl@0
  1792
    while (height--)
sl@0
  1793
    {
sl@0
  1794
	int twidth = width;
sl@0
  1795
	CARD32 *p = (CARD32 *)maskLine;
sl@0
  1796
	CARD16 *q = (CARD16 *)dstLine;
sl@0
  1797
	
sl@0
  1798
	while (twidth && ((unsigned long)q & 7))
sl@0
  1799
	{
sl@0
  1800
	    CARD32 m = *(CARD32 *)p;
sl@0
  1801
	    
sl@0
  1802
	    if (m)
sl@0
  1803
	    {
sl@0
  1804
		ullong d = *q;
sl@0
  1805
		__m64 vdest = expand565 ((__m64)d, 0);
sl@0
  1806
		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
sl@0
  1807
		*q = (ullong)vdest;
sl@0
  1808
	    }
sl@0
  1809
	    
sl@0
  1810
	    twidth--;
sl@0
  1811
	    p++;
sl@0
  1812
	    q++;
sl@0
  1813
	}
sl@0
  1814
	
sl@0
  1815
	while (twidth >= 4)
sl@0
  1816
	{
sl@0
  1817
	    CARD32 m0, m1, m2, m3;
sl@0
  1818
	    
sl@0
  1819
	    m0 = *p;
sl@0
  1820
	    m1 = *(p + 1);
sl@0
  1821
	    m2 = *(p + 2);
sl@0
  1822
	    m3 = *(p + 3);
sl@0
  1823
	    
sl@0
  1824
	    if ((m0 | m1 | m2 | m3))
sl@0
  1825
	    {
sl@0
  1826
		__m64 vdest = *(__m64 *)q;
sl@0
  1827
		
sl@0
  1828
		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
sl@0
  1829
		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
sl@0
  1830
		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
sl@0
  1831
		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
sl@0
  1832
		
sl@0
  1833
		*(__m64 *)q = vdest;
sl@0
  1834
	    }
sl@0
  1835
	    twidth -= 4;
sl@0
  1836
	    p += 4;
sl@0
  1837
	    q += 4;
sl@0
  1838
	}
sl@0
  1839
	
sl@0
  1840
	while (twidth)
sl@0
  1841
	{
sl@0
  1842
	    CARD32 m;
sl@0
  1843
	    
sl@0
  1844
	    m = *(CARD32 *)p;
sl@0
  1845
	    if (m)
sl@0
  1846
	    {
sl@0
  1847
		ullong d = *q;
sl@0
  1848
		__m64 vdest = expand565((__m64)d, 0);
sl@0
  1849
		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
sl@0
  1850
		*q = (ullong)vdest;
sl@0
  1851
	    }
sl@0
  1852
	    
sl@0
  1853
	    twidth--;
sl@0
  1854
	    p++;
sl@0
  1855
	    q++;
sl@0
  1856
	}
sl@0
  1857
	
sl@0
  1858
	maskLine += maskStride;
sl@0
  1859
	dstLine += dstStride;
sl@0
  1860
    }
sl@0
  1861
    
sl@0
  1862
    _mm_empty ();
sl@0
  1863
}
sl@0
  1864
#endif
sl@0
  1865
sl@0
  1866
/*
 * Saturating byte-wise add of src into dst:
 *     dst[i] = min (dst[i] + src[i], 255)   for i in [0, w).
 *
 * dst  destination bytes; read and overwritten in place.
 * src  source bytes; only read.
 * w    number of bytes to process.  A zero or negative count leaves dst
 *      untouched.
 *
 * Bytes are handled singly until dst is 8-byte aligned, then eight at a time
 * with the MMX saturating add, then singly again for the remainder.
 */
static void
fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
{
    int sum;

    /* Scalar head.  The loops test w > 0 rather than w != 0 so that a
     * negative count is a no-op instead of walking off both buffers. */
    while (w > 0 && ((unsigned long)dst & 7))
    {
        sum = *dst + *src;
        /* sum is at most 510, so (sum >> 8) is 1 exactly on overflow;
         * 0 - 1 is all ones, and OR-ing it in forces the stored byte to 0xff. */
        *dst = sum | (0 - (sum >> 8));

        dst++;
        src++;
        w--;
    }

    /* Bulk: eight bytes per step; dst is 8-byte aligned here. */
    while (w >= 8)
    {
        *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
        dst += 8;
        src += 8;
        w -= 8;
    }

    /* Scalar tail for the remaining (fewer than eight) bytes. */
    while (w > 0)
    {
        sum = *dst + *src;
        *dst = sum | (0 - (sum >> 8));

        dst++;
        src++;
        w--;
    }

    /* Leave MMX state so that later x87 floating-point code works. */
    _mm_empty();
}
sl@0
  1909
/* Presumably registers fbCompositeSrcAdd_8000x8000mmx as an implementation of
 * the composite_add_u8 function class, tagged with the MMX flag -- confirm
 * against the definition of OIL_DEFINE_IMPL_FULL. */
OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX);
sl@0
  1910
sl@0
  1911
/*
 * Saturating per-channel add of 32-bit pixels: each of the four bytes of
 * dst[i] becomes min (dst byte + src byte, 255), for i in [0, w).
 *
 * dst  destination pixels; read and overwritten in place.
 * src  source pixels; only read.
 * w    number of 32-bit pixels to process.  A zero or negative count leaves
 *      dst untouched.
 *
 * A single leading pixel is processed if needed to bring dst to 8-byte
 * alignment, then pixels are processed in pairs, then one trailing pixel if
 * the remaining count is odd.
 */
static void
fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
{
    /* Head: one pixel at a time until dst is 8-byte aligned.  Testing
     * w > 0 (not w != 0) keeps a negative count from touching memory. */
    while (w > 0 && ((unsigned long)dst & 7))
    {
        *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
                                             _mm_cvtsi32_si64(*dst)));
        dst++;
        src++;
        w--;
    }

    /* Bulk: two pixels (eight bytes) per step. */
    while (w >= 2)
    {
        *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
        dst += 2;
        src += 2;
        w -= 2;
    }

    /* Tail: at most one pixel can remain. */
    if (w > 0)
    {
        *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
                                             _mm_cvtsi32_si64(*dst)));
    }

    /* Leave MMX state so that later x87 floating-point code works. */
    _mm_empty();
}
sl@0
  1940
/* Presumably registers fbCompositeSrcAdd_8888x8888mmx as an implementation of
 * the composite_add_argb function class, tagged with both the MMX and SSE
 * flags -- confirm against the definition of OIL_DEFINE_IMPL_FULL. */
OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
sl@0
  1941
sl@0
  1942
#if 0
sl@0
  1943
/*
 * GetStart(drw, x, y, type, stride, line, bpp)
 *
 * Expands to a brace-enclosed block that calls fbGetDrawable on `drw`, then
 *   - assigns `stride` the drawable stride scaled by
 *     sizeof (FbBits) / sizeof (type), and
 *   - assigns `line` a `type *` pointing at position (x, y), after the x/y
 *     offsets reported by fbGetDrawable have been subtracted.
 * `bpp` receives whatever fbGetDrawable stores in its bpp argument.
 */
#define GetStart(drw,x,y,type,stride,line,bpp) {				\
    FbBits	*__bits__;							\
    FbStride	__stride__;							\
    int		__xoff__;							\
    int		__yoff__;							\
										\
    fbGetDrawable ((drw), __bits__, __stride__, bpp, __xoff__, __yoff__);	\
    (stride) = __stride__ * sizeof (FbBits) / sizeof (type);			\
    (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \
}
sl@0
  1952
sl@0
  1953
Bool
sl@0
  1954
fbSolidFillmmx (DrawablePtr	pDraw,
sl@0
  1955
		int		x,
sl@0
  1956
		int		y,
sl@0
  1957
		int		width,
sl@0
  1958
		int		height,
sl@0
  1959
		FbBits		xor)
sl@0
  1960
{ 
sl@0
  1961
    FbStride	stride;
sl@0
  1962
    int		bpp;
sl@0
  1963
    ullong	fill;
sl@0
  1964
    __m64	vfill;
sl@0
  1965
    CARD32	byte_width;
sl@0
  1966
    CARD8	*byte_line;
sl@0
  1967
    FbBits      *bits;
sl@0
  1968
    int		xoff, yoff;
sl@0
  1969
    
sl@0
  1970
    CHECKPOINT();
sl@0
  1971
    
sl@0
  1972
    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
sl@0
  1973
    
sl@0
  1974
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
sl@0
  1975
	return FALSE;
sl@0
  1976
    
sl@0
  1977
    if (bpp != 16 && bpp != 32)
sl@0
  1978
	return FALSE;
sl@0
  1979
    
sl@0
  1980
    if (bpp == 16)
sl@0
  1981
    {
sl@0
  1982
	stride = stride * sizeof (FbBits) / 2;
sl@0
  1983
	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
sl@0
  1984
	byte_width = 2 * width;
sl@0
  1985
	stride *= 2;
sl@0
  1986
    }
sl@0
  1987
    else
sl@0
  1988
    {
sl@0
  1989
	stride = stride * sizeof (FbBits) / 4;
sl@0
  1990
	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
sl@0
  1991
	byte_width = 4 * width;
sl@0
  1992
	stride *= 4;
sl@0
  1993
    }
sl@0
  1994
    
sl@0
  1995
    fill = ((ullong)xor << 32) | xor;
sl@0
  1996
    vfill = (__m64)fill;
sl@0
  1997
    
sl@0
  1998
    while (height--)
sl@0
  1999
    {
sl@0
  2000
	int w;
sl@0
  2001
	CARD8 *d = byte_line;
sl@0
  2002
	byte_line += stride;
sl@0
  2003
	w = byte_width;
sl@0
  2004
	
sl@0
  2005
	while (w >= 2 && ((unsigned long)d & 3))
sl@0
  2006
	{
sl@0
  2007
	    *(CARD16 *)d = xor;
sl@0
  2008
	    w -= 2;
sl@0
  2009
	    d += 2;
sl@0
  2010
	}
sl@0
  2011
	
sl@0
  2012
	while (w >= 4 && ((unsigned long)d & 7))
sl@0
  2013
	{
sl@0
  2014
	    *(CARD32 *)d = xor;
sl@0
  2015
	    
sl@0
  2016
	    w -= 4;
sl@0
  2017
	    d += 4;
sl@0
  2018
	}
sl@0
  2019
	
sl@0
  2020
	while (w >= 64)
sl@0
  2021
	{
sl@0
  2022
	    *(__m64*) (d +  0) = vfill;
sl@0
  2023
	    *(__m64*) (d +  8) = vfill;
sl@0
  2024
	    *(__m64*) (d + 16) = vfill;
sl@0
  2025
	    *(__m64*) (d + 24) = vfill;
sl@0
  2026
	    *(__m64*) (d + 32) = vfill;
sl@0
  2027
	    *(__m64*) (d + 40) = vfill;
sl@0
  2028
	    *(__m64*) (d + 48) = vfill;
sl@0
  2029
	    *(__m64*) (d + 56) = vfill;
sl@0
  2030
	    
sl@0
  2031
	    w -= 64;
sl@0
  2032
	    d += 64;
sl@0
  2033
	}
sl@0
  2034
	while (w >= 4)
sl@0
  2035
	{
sl@0
  2036
	    *(CARD32 *)d = xor;
sl@0
  2037
	    
sl@0
  2038
	    w -= 4;
sl@0
  2039
	    d += 4;
sl@0
  2040
	}
sl@0
  2041
	if (w >= 2)
sl@0
  2042
	{
sl@0
  2043
	    *(CARD16 *)d = xor;
sl@0
  2044
	    w -= 2;
sl@0
  2045
	    d += 2;
sl@0
  2046
	}
sl@0
  2047
    }
sl@0
  2048
    
sl@0
  2049
    _mm_empty();
sl@0
  2050
    return TRUE;
sl@0
  2051
}
sl@0
  2052
sl@0
  2053
Bool
sl@0
  2054
fbCopyAreammx (DrawablePtr	pSrc,
sl@0
  2055
	       DrawablePtr	pDst,
sl@0
  2056
	       int		src_x,
sl@0
  2057
	       int		src_y,
sl@0
  2058
	       int		dst_x,
sl@0
  2059
	       int		dst_y,
sl@0
  2060
	       int		width,
sl@0
  2061
	       int		height)
sl@0
  2062
{
sl@0
  2063
    FbBits *	src_bits;
sl@0
  2064
    FbStride	src_stride;
sl@0
  2065
    int		src_bpp;
sl@0
  2066
    int		src_xoff;
sl@0
  2067
    int		src_yoff;
sl@0
  2068
sl@0
  2069
    FbBits *	dst_bits;
sl@0
  2070
    FbStride	dst_stride;
sl@0
  2071
    int		dst_bpp;
sl@0
  2072
    int		dst_xoff;
sl@0
  2073
    int		dst_yoff;
sl@0
  2074
sl@0
  2075
    CARD8 *	src_bytes;
sl@0
  2076
    CARD8 *	dst_bytes;
sl@0
  2077
    int		byte_width;
sl@0
  2078
    
sl@0
  2079
    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
sl@0
  2080
    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
sl@0
  2081
sl@0
  2082
    if (src_bpp != 16 && src_bpp != 32)
sl@0
  2083
	return FALSE;
sl@0
  2084
sl@0
  2085
    if (dst_bpp != 16 && dst_bpp != 32)
sl@0
  2086
	return FALSE;
sl@0
  2087
sl@0
  2088
    if (src_bpp != dst_bpp)
sl@0
  2089
    {
sl@0
  2090
	return FALSE;
sl@0
  2091
    }
sl@0
  2092
    
sl@0
  2093
    if (src_bpp == 16)
sl@0
  2094
    {
sl@0
  2095
	src_stride = src_stride * sizeof (FbBits) / 2;
sl@0
  2096
	dst_stride = dst_stride * sizeof (FbBits) / 2;
sl@0
  2097
	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
sl@0
  2098
	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
sl@0
  2099
	byte_width = 2 * width;
sl@0
  2100
	src_stride *= 2;
sl@0
  2101
	dst_stride *= 2;
sl@0
  2102
    }
sl@0
  2103
    else
sl@0
  2104
    {
sl@0
  2105
	src_stride = src_stride * sizeof (FbBits) / 4;
sl@0
  2106
	dst_stride = dst_stride * sizeof (FbBits) / 4;
sl@0
  2107
	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
sl@0
  2108
	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
sl@0
  2109
	byte_width = 4 * width;
sl@0
  2110
	src_stride *= 4;
sl@0
  2111
	dst_stride *= 4;
sl@0
  2112
    }
sl@0
  2113
sl@0
  2114
    while (height--)
sl@0
  2115
    {
sl@0
  2116
	int w;
sl@0
  2117
	CARD8 *s = src_bytes;
sl@0
  2118
	CARD8 *d = dst_bytes;
sl@0
  2119
	src_bytes += src_stride;
sl@0
  2120
	dst_bytes += dst_stride;
sl@0
  2121
	w = byte_width;
sl@0
  2122
	
sl@0
  2123
	while (w >= 2 && ((unsigned long)d & 3))
sl@0
  2124
	{
sl@0
  2125
	    *(CARD16 *)d = *(CARD16 *)s;
sl@0
  2126
	    w -= 2;
sl@0
  2127
	    s += 2;
sl@0
  2128
	    d += 2;
sl@0
  2129
	}
sl@0
  2130
	
sl@0
  2131
	while (w >= 4 && ((unsigned long)d & 7))
sl@0
  2132
	{
sl@0
  2133
	    *(CARD32 *)d = *(CARD32 *)s;
sl@0
  2134
	    
sl@0
  2135
	    w -= 4;
sl@0
  2136
	    s += 4;
sl@0
  2137
	    d += 4;
sl@0
  2138
	}
sl@0
  2139
	
sl@0
  2140
	while (w >= 64)
sl@0
  2141
	{
sl@0
  2142
	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
sl@0
  2143
	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
sl@0
  2144
	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
sl@0
  2145
	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
sl@0
  2146
	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
sl@0
  2147
	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
sl@0
  2148
	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
sl@0
  2149
	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
sl@0
  2150
	    w -= 64;
sl@0
  2151
	    s += 64;
sl@0
  2152
	    d += 64;
sl@0
  2153
	}
sl@0
  2154
	while (w >= 4)
sl@0
  2155
	{
sl@0
  2156
	    *(CARD32 *)d = *(CARD32 *)s;
sl@0
  2157
sl@0
  2158
	    w -= 4;
sl@0
  2159
	    s += 4;
sl@0
  2160
	    d += 4;
sl@0
  2161
	}
sl@0
  2162
	if (w >= 2)
sl@0
  2163
	{
sl@0
  2164
	    *(CARD16 *)d = *(CARD16 *)s;
sl@0
  2165
	    w -= 2;
sl@0
  2166
	    s += 2;
sl@0
  2167
	    d += 2;
sl@0
  2168
	}
sl@0
  2169
    }
sl@0
  2170
    
sl@0
  2171
    _mm_empty();
sl@0
  2172
    return TRUE;
sl@0
  2173
}
sl@0
  2174
sl@0
  2175
void
sl@0
  2176
fbCompositeCopyAreammx (CARD8		op,
sl@0
  2177
			PicturePtr	pSrc,
sl@0
  2178
			PicturePtr	pMask,
sl@0
  2179
			PicturePtr	pDst,
sl@0
  2180
			INT16		xSrc,
sl@0
  2181
			INT16		ySrc,
sl@0
  2182
			INT16		xMask,
sl@0
  2183
			INT16		yMask,
sl@0
  2184
			INT16		xDst,
sl@0
  2185
			INT16		yDst,
sl@0
  2186
			CARD16		width,
sl@0
  2187
			CARD16		height)
sl@0
  2188
{
sl@0
  2189
    fbCopyAreammx (pSrc->pDrawable,
sl@0
  2190
		   pDst->pDrawable,
sl@0
  2191
		   xSrc, ySrc,
sl@0
  2192
		   xDst, yDst,
sl@0
  2193
		   width, height);
sl@0
  2194
}
sl@0
  2195
sl@0
  2196
#if !defined(__amd64__) && !defined(__x86_64__)
sl@0
  2197
sl@0
  2198
/*
 * Bit flags describing detected CPU capabilities.  SSE deliberately includes
 * the MMX_Extensions bit, so setting SSE also sets MMX_Extensions.
 */
enum CPUFeatures {
    NoFeatures     = 0,
    MMX            = 1 << 0,
    MMX_Extensions = 1 << 1,
    SSE            = (1 << 2) | MMX_Extensions,
    SSE2           = 1 << 3,
    CMOV           = 1 << 4
};
sl@0
  2206
sl@0
  2207
static unsigned int detectCPUFeatures(void) {
sl@0
  2208
    unsigned int result;
sl@0
  2209
    char vendor[13];
sl@0
  2210
    vendor[0] = 0;
sl@0
  2211
    vendor[12] = 0;
sl@0
  2212
    /* see p. 118 of amd64 instruction set manual Vol3 */
sl@0
  2213
    __asm__ ("push %%ebx\n"
sl@0
  2214
             "pushf\n"
sl@0
  2215
             "pop %%eax\n"
sl@0
  2216
             "mov %%eax, %%ebx\n"
sl@0
  2217
             "xor $0x00200000, %%eax\n"
sl@0
  2218
             "push %%eax\n"
sl@0
  2219
             "popf\n"
sl@0
  2220
             "pushf\n"
sl@0
  2221
             "pop %%eax\n"
sl@0
  2222
             "mov $0x0, %%edx\n"
sl@0
  2223
             "xor %%ebx, %%eax\n"
sl@0
  2224
             "jz skip\n"
sl@0
  2225
sl@0
  2226
             "mov $0x00000000, %%eax\n"
sl@0
  2227
             "cpuid\n"
sl@0
  2228
             "mov %%ebx, %1\n"
sl@0
  2229
             "mov %%edx, %2\n"
sl@0
  2230
             "mov %%ecx, %3\n"
sl@0
  2231
             "mov $0x00000001, %%eax\n"
sl@0
  2232
             "cpuid\n"
sl@0
  2233
             "skip:\n"
sl@0
  2234
             "pop %%ebx\n"
sl@0
  2235
             "mov %%edx, %0\n"
sl@0
  2236
             : "=r" (result), 
sl@0
  2237
               "=m" (vendor[0]), 
sl@0
  2238
               "=m" (vendor[4]), 
sl@0
  2239
               "=m" (vendor[8])
sl@0
  2240
             :
sl@0
  2241
             : "%eax", "%ecx", "%edx"
sl@0
  2242
        );
sl@0
  2243
sl@0
  2244
    unsigned int features = 0;
sl@0
  2245
    if (result) {
sl@0
  2246
        /* result now contains the standard feature bits */
sl@0
  2247
        if (result & (1 << 15))
sl@0
  2248
            features |= CMOV;
sl@0
  2249
        if (result & (1 << 23))
sl@0
  2250
            features |= MMX;
sl@0
  2251
        if (result & (1 << 25))
sl@0
  2252
            features |= SSE;
sl@0
  2253
        if (result & (1 << 26))
sl@0
  2254
            features |= SSE2;
sl@0
  2255
        if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
sl@0
  2256
            /* check for AMD MMX extensions */
sl@0
  2257
sl@0
  2258
            unsigned int result;            
sl@0
  2259
            __asm__("push %%ebx\n"
sl@0
  2260
                    "mov $0x80000000, %%eax\n"
sl@0
  2261
                    "cpuid\n"
sl@0
  2262
                    "xor %%edx, %%edx\n"
sl@0
  2263
                    "cmp $0x1, %%eax\n"
sl@0
  2264
                    "jge skip2\n"
sl@0
  2265
                    "mov $0x80000001, %%eax\n"
sl@0
  2266
                    "cpuid\n"
sl@0
  2267
                    "skip2:\n"
sl@0
  2268
                    "mov %%edx, %0\n"
sl@0
  2269
                    "pop %%ebx\n"
sl@0
  2270
                    : "=r" (result)
sl@0
  2271
                    :
sl@0
  2272
                    : "%eax", "%ecx", "%edx"
sl@0
  2273
                );
sl@0
  2274
            if (result & (1<<22))
sl@0
  2275
                features |= MMX_Extensions;
sl@0
  2276
        }
sl@0
  2277
    }
sl@0
  2278
    return features;
sl@0
  2279
}
sl@0
  2280
sl@0
  2281
Bool
sl@0
  2282
fbHaveMMX (void)
sl@0
  2283
{
sl@0
  2284
    static Bool initialized = FALSE;
sl@0
  2285
    static Bool mmx_present;
sl@0
  2286
    
sl@0
  2287
    if (!initialized)
sl@0
  2288
    {
sl@0
  2289
        unsigned int features = detectCPUFeatures();
sl@0
  2290
	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
sl@0
  2291
        initialized = TRUE;
sl@0
  2292
    }
sl@0
  2293
    
sl@0
  2294
    return mmx_present;
sl@0
  2295
}
sl@0
  2296
#endif /* __amd64__ */
sl@0
  2297
sl@0
  2298
sl@0
  2299
#endif
sl@0
  2300
sl@0
  2301
sl@0
  2302
#ifdef	__SYMBIAN32__
sl@0
  2303
 
sl@0
  2304
OilFunctionImpl* __oil_function_impl_mmxCombineOverU, composite_over_argb() {
sl@0
  2305
		return &_oil_function_impl_mmxCombineOverU, composite_over_argb;
sl@0
  2306
}
sl@0
  2307
#endif
sl@0
  2308
sl@0
  2309
#ifdef	__SYMBIAN32__
sl@0
  2310
 
sl@0
  2311
OilFunctionImpl* __oil_function_impl_mmxCombineAddU, composite_add_argb() {
sl@0
  2312
		return &_oil_function_impl_mmxCombineAddU, composite_add_argb;
sl@0
  2313
}
sl@0
  2314
#endif
sl@0
  2315
sl@0
  2316
#ifdef	__SYMBIAN32__
sl@0
  2317
 
sl@0
  2318
OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src() {
sl@0
  2319
		return &_oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src;
sl@0
  2320
}
sl@0
  2321
#endif
sl@0
  2322
sl@0
  2323
#ifdef	__SYMBIAN32__
sl@0
  2324
 
sl@0
  2325
OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8() {
sl@0
  2326
		return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8;
sl@0
  2327
}
sl@0
  2328
#endif
sl@0
  2329
sl@0
  2330
#ifdef	__SYMBIAN32__
sl@0
  2331
 
sl@0
  2332
OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb() {
sl@0
  2333
		return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb;
sl@0
  2334
}
sl@0
  2335
#endif
sl@0
  2336