os/ossrv/genericopenlibs/liboil/src/composite_sse_4pix.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
 * Copyright (c) 2005
sl@0
     3
 *	Eric Anholt.  All rights reserved.
sl@0
     4
 *
sl@0
     5
 * Redistribution and use in source and binary forms, with or without
sl@0
     6
 * modification, are permitted provided that the following conditions
sl@0
     7
 * are met:
sl@0
     8
 * 1. Redistributions of source code must retain the above copyright
sl@0
     9
 *    notice, this list of conditions and the following disclaimer.
sl@0
    10
 * 2. Redistributions in binary form must reproduce the above copyright
sl@0
    11
 *    notice, this list of conditions and the following disclaimer in the
sl@0
    12
 *    documentation and/or other materials provided with the distribution.
sl@0
    13
 *
sl@0
    14
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
sl@0
    15
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
sl@0
    16
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
sl@0
    17
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
sl@0
    18
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
sl@0
    19
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
sl@0
    20
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
sl@0
    21
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
sl@0
    22
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
sl@0
    23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
sl@0
    24
 * SUCH DAMAGE.
sl@0
    25
 */
sl@0
    26
sl@0
    27
#ifdef HAVE_CONFIG_H
sl@0
    28
#include "config.h"
sl@0
    29
#endif
sl@0
    30
#include <liboilclasses.h>
sl@0
    31
#include <liboilfunction.h>
sl@0
    32
#include <emmintrin.h>
sl@0
    33
#include "liboil/liboilcolorspace.h"
sl@0
    34
sl@0
    35
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
sl@0
    36
sl@0
    37
#ifdef ENABLE_BROKEN_IMPLS
sl@0
    38
sl@0
    39
/* Overlay that lets a 128-bit SSE constant be spelled as two 64-bit
 * integers in a static initializer and read back as an __m128i.
 */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constant vectors shared by the SSE2 helpers in this file. */
static const struct _SSEData {
  union m128_int sse_16xff;   /* 0xff in each of the 16 bytes */
  union m128_int sse_8x0080;  /* 0x0080 in each of the 8 16-bit lanes */
} c = {
  .sse_16xff  = { .ull = { 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
  .sse_8x0080 = { .ull = { 0x0080008000800080ULL, 0x0080008000800080ULL } },
};

/* MC(name) is the __m128i view of the constant c.sse_<name>. */
#define MC(x) (c.sse_##x.m128)
sl@0
    53
sl@0
    54
/* Scalar (non-SSE2) compositing primitives.  They are built on
 * oil_muldiv_255() and oil_clamp_255(), which are not defined in this file
 * (presumably they come from liboilcolorspace.h -- confirm there).
 */

/* OVER for one channel: d plus s, minus d scaled through oil_muldiv_255 by m. */
#define COMPOSITE_OVER(d, s, m) ((d) + (s) - oil_muldiv_255((d), (m)))

/* ADD for one channel: the sum of d and s passed through oil_clamp_255. */
#define COMPOSITE_ADD(d, s) oil_clamp_255((d) + (s))

/* IN for one channel: s scaled through oil_muldiv_255 by m. */
#define COMPOSITE_IN(s, m) oil_muldiv_255((s), (m))
sl@0
    58
sl@0
    59
/* This SSE2 code is based around operations on four pixels at a time.  The
sl@0
    60
 * exception is muldiv_255_sse2, which needs to expand the four pixels into
sl@0
    61
 * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
sl@0
    62
 * the appropriate rounding on division.
sl@0
    63
 */
sl@0
    64
sl@0
    65
/* Shuffles the given value such that the alpha for each pixel appears in each
sl@0
    66
 * channel of the pixel.
sl@0
    67
 */
sl@0
    68
SSE_FUNCTION static inline __m128i
sl@0
    69
argb_A_sse2(__m128i a)
sl@0
    70
{
sl@0
    71
#if 0
sl@0
    72
  /* Shift the alpha channel of each pixel into the low byte */
sl@0
    73
  a = _mm_srli_epi32(a, 24);
sl@0
    74
  /* Now, shift and or so we can get it into all the channels */
sl@0
    75
  a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
sl@0
    76
  a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
sl@0
    77
  return a;
sl@0
    78
#else
sl@0
    79
  /* Move the alpha channel into the low byte */
sl@0
    80
  a = _mm_srli_epi32(a, 24);
sl@0
    81
  /* Pack our four alpha channels down into the lower 32 bits */
sl@0
    82
  a = _mm_packus_epi16(a, _mm_setzero_si128());
sl@0
    83
  a = _mm_packus_epi16(a, _mm_setzero_si128());
sl@0
    84
  /* And expand it back out into four pixels of all channels the same */
sl@0
    85
  a = _mm_unpacklo_epi8(a, a);
sl@0
    86
  return _mm_unpacklo_epi16(a, a);
sl@0
    87
#endif
sl@0
    88
}
sl@0
    89
sl@0
    90
/* Multiplies the unpacked 16-bits-per-channel pixel data in a
sl@0
    91
 * channel-by-channel by b, and divides the result by 255, with rounding.
sl@0
    92
 */
sl@0
    93
SSE_FUNCTION static inline __m128i
sl@0
    94
inner_muldiv_255_sse2(__m128i a, __m128i b)
sl@0
    95
{
sl@0
    96
  __m128i ret;
sl@0
    97
  __m128i roundconst = MC(8x0080);
sl@0
    98
sl@0
    99
  ret = _mm_mullo_epi16(a, b);
sl@0
   100
  ret = _mm_adds_epu16(ret, roundconst);
sl@0
   101
  ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
sl@0
   102
  ret = _mm_srli_epi16(ret, 8);
sl@0
   103
sl@0
   104
  return ret;
sl@0
   105
}
sl@0
   106
sl@0
   107
SSE_FUNCTION static inline __m128i
sl@0
   108
muldiv_255_sse2(__m128i a, __m128i b)
sl@0
   109
{
sl@0
   110
  __m128i alow, blow, ahigh, bhigh, low, high;
sl@0
   111
sl@0
   112
  alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
sl@0
   113
  blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
sl@0
   114
  ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
sl@0
   115
  bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
sl@0
   116
  low = inner_muldiv_255_sse2(alow, blow);
sl@0
   117
  high = inner_muldiv_255_sse2(ahigh, bhigh);
sl@0
   118
  return _mm_packus_epi16(low, high);
sl@0
   119
}
sl@0
   120
sl@0
   121
/* Inverts every bit of a by XOR-ing it with the all-ones constant, which
 * turns each byte x into 255 - x.
 */
SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)
{
  const __m128i all_ones = MC(16xff);

  return _mm_xor_si128(a, all_ones);
}
sl@0
   126
sl@0
   127
/* Loads 16 bytes (four 32-bit pixels) starting at src.  The unaligned load
 * intrinsic is used, so src does not have to sit on a 16-byte boundary.
 */
SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t *src)
{
  /* The load never writes, so the const qualifier is kept in the cast. */
  return _mm_loadu_si128((const __m128i *)src);
}
sl@0
   132
sl@0
   133
/* Returns a vector whose four 32-bit lanes all hold the pixel src. */
SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)
{
  __m128i splat = _mm_set1_epi32(src);

  return splat;
}
sl@0
   138
sl@0
   139
/* Reads four mask bytes m[0..3] and expands them so that the 32-bit lane
 * for pixel i holds m[i] in each of its four bytes.
 */
SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t *m)
{
  uint32_t packed;
  __m128i a;

  /* Assemble the four bytes explicitly instead of dereferencing m as a
   * uint32_t: m carries no alignment guarantee and a byte array must not
   * be read through a uint32_t lvalue.  m[0] lands in the lowest byte,
   * which is the same value a 32-bit load yields on x86. */
  packed = (uint32_t)m[0]
      | ((uint32_t)m[1] << 8)
      | ((uint32_t)m[2] << 16)
      | ((uint32_t)m[3] << 24);

  a = _mm_cvtsi32_si128(packed);
  /* m0 m1 m2 m3 -> m0 m0 m1 m1 ... -> m0 m0 m0 m0 m1 m1 m1 m1 ... */
  a = _mm_unpacklo_epi8(a, a);
  a = _mm_unpacklo_epi16(a, a);
  return a;
}
sl@0
   148
sl@0
   149
/* Returns a vector whose sixteen bytes all hold the mask value m. */
SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)
{
  __m128i splat = _mm_set1_epi8(m);

  return splat;
}
sl@0
   154
sl@0
   155
/* Writes the 16 bytes of pix to dest using the aligned store intrinsic;
 * callers therefore have to pass a 16-byte aligned dest.
 */
SSE_FUNCTION static void
store_argb_sse2(uint32_t *dest, __m128i pix)
{
  __m128i *out = (__m128i *)dest;

  _mm_store_si128(out, pix);
}
sl@0
   160
sl@0
   161
SSE_FUNCTION static __m128i 
sl@0
   162
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
sl@0
   163
{
sl@0
   164
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
sl@0
   165
}
sl@0
   166
sl@0
   167
SSE_FUNCTION static void
sl@0
   168
composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
sl@0
   169
    int n)
sl@0
   170
{
sl@0
   171
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   172
    uint32_t s = *src++;
sl@0
   173
    uint8_t m = *mask++;
sl@0
   174
sl@0
   175
    *dest++ = oil_argb(
sl@0
   176
	COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   177
	COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   178
	COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   179
	COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   180
  }
sl@0
   181
  for (; n >= 4; n -= 4) {
sl@0
   182
    __m128i s, m;
sl@0
   183
    s = load_argb_sse2(src);
sl@0
   184
    m = load_u8_mask(mask);
sl@0
   185
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
sl@0
   186
    src += 4;
sl@0
   187
    mask += 4;
sl@0
   188
    dest += 4;
sl@0
   189
  }
sl@0
   190
  for (; n > 0; n--) {
sl@0
   191
    uint32_t s = *src++;
sl@0
   192
    uint8_t m = *mask++;
sl@0
   193
sl@0
   194
    *dest++ = oil_argb(
sl@0
   195
	COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   196
	COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   197
	COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   198
	COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   199
  }
sl@0
   200
}
sl@0
   201
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
sl@0
   202
    OIL_IMPL_FLAG_SSE2);
sl@0
   203
sl@0
   204
SSE_FUNCTION static void
sl@0
   205
composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
sl@0
   206
    const uint8_t *mask, int n)
sl@0
   207
{
sl@0
   208
  __m128i s;
sl@0
   209
sl@0
   210
  s = set1_argb_sse2(*src);
sl@0
   211
sl@0
   212
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   213
    uint8_t m = *mask++;
sl@0
   214
sl@0
   215
    *dest++ = oil_argb(
sl@0
   216
	COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   217
	COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   218
	COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   219
	COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   220
  }
sl@0
   221
  for (; n >= 4; n -= 4) {
sl@0
   222
    __m128i m;
sl@0
   223
    m = load_u8_mask(mask);
sl@0
   224
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
sl@0
   225
    mask += 4;
sl@0
   226
    dest += 4;
sl@0
   227
  }
sl@0
   228
  for (; n > 0; n--) {
sl@0
   229
    uint8_t m = *mask++;
sl@0
   230
sl@0
   231
    *dest++ = oil_argb(
sl@0
   232
	COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   233
	COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   234
	COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   235
	COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   236
  }
sl@0
   237
}
sl@0
   238
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
sl@0
   239
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   240
sl@0
   241
SSE_FUNCTION static void
sl@0
   242
composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
sl@0
   243
    const uint8_t *mask, int n)
sl@0
   244
{
sl@0
   245
  __m128i m;
sl@0
   246
sl@0
   247
  m = set1_u8_mask(*mask);
sl@0
   248
sl@0
   249
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   250
    uint32_t s = *src++;
sl@0
   251
sl@0
   252
    *dest++ = oil_argb(
sl@0
   253
	COMPOSITE_IN(oil_argb_A(s), mask[0]),
sl@0
   254
	COMPOSITE_IN(oil_argb_R(s), mask[0]),
sl@0
   255
	COMPOSITE_IN(oil_argb_G(s), mask[0]),
sl@0
   256
	COMPOSITE_IN(oil_argb_B(s), mask[0]));
sl@0
   257
  }
sl@0
   258
  for (; n >= 4; n -= 4) {
sl@0
   259
    __m128i s;
sl@0
   260
    s = load_argb_sse2(src);
sl@0
   261
    store_argb_sse2(dest,  muldiv_255_sse2(s, m));
sl@0
   262
    src += 4;
sl@0
   263
    dest += 4;
sl@0
   264
  }
sl@0
   265
  for (; n > 0; n--) {
sl@0
   266
    uint32_t s = *src++;
sl@0
   267
sl@0
   268
    *dest++ = oil_argb(
sl@0
   269
	COMPOSITE_IN(oil_argb_A(s), mask[0]),
sl@0
   270
	COMPOSITE_IN(oil_argb_R(s), mask[0]),
sl@0
   271
	COMPOSITE_IN(oil_argb_G(s), mask[0]),
sl@0
   272
	COMPOSITE_IN(oil_argb_B(s), mask[0]));
sl@0
   273
  }
sl@0
   274
}
sl@0
   275
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
sl@0
   276
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
sl@0
   277
sl@0
   278
SSE_FUNCTION static void
sl@0
   279
composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
sl@0
   280
{
sl@0
   281
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   282
    uint32_t d = *dest, s = *src++;
sl@0
   283
    uint8_t srca = oil_argb_A(s);
sl@0
   284
    d = oil_argb(
sl@0
   285
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
sl@0
   286
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
sl@0
   287
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
sl@0
   288
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
sl@0
   289
    *dest++ = d;
sl@0
   290
  }
sl@0
   291
  for (; n >= 4; n -= 4) {
sl@0
   292
    __m128i d, s;
sl@0
   293
    s = load_argb_sse2(src);
sl@0
   294
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
sl@0
   295
    store_argb_sse2(dest, d);
sl@0
   296
    src += 4;
sl@0
   297
    dest += 4;
sl@0
   298
  }
sl@0
   299
  for (; n > 0; n--) {
sl@0
   300
    uint32_t d = *dest, s = *src++;
sl@0
   301
    uint8_t srca = oil_argb_A(s);
sl@0
   302
    d = oil_argb(
sl@0
   303
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
sl@0
   304
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
sl@0
   305
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
sl@0
   306
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
sl@0
   307
    *dest++ = d;
sl@0
   308
  }
sl@0
   309
}
sl@0
   310
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
sl@0
   311
    OIL_IMPL_FLAG_SSE2);
sl@0
   312
sl@0
   313
SSE_FUNCTION static void
sl@0
   314
composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
sl@0
   315
{
sl@0
   316
  __m128i s, sa;
sl@0
   317
  uint32_t srca;
sl@0
   318
sl@0
   319
  srca = oil_argb_A(*src);
sl@0
   320
  s = set1_argb_sse2(*src);
sl@0
   321
  sa = negate_argb_sse2(argb_A_sse2(s));
sl@0
   322
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   323
    uint32_t d = *dest;
sl@0
   324
    d = oil_argb(
sl@0
   325
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
sl@0
   326
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
sl@0
   327
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
sl@0
   328
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
sl@0
   329
    *dest++ = d;
sl@0
   330
  }
sl@0
   331
  for (; n >= 4; n -= 4) {
sl@0
   332
    __m128i d;
sl@0
   333
    d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
sl@0
   334
    store_argb_sse2(dest, d);
sl@0
   335
    dest += 4;
sl@0
   336
  }
sl@0
   337
  for (; n > 0; n--) {
sl@0
   338
    uint32_t d = *dest;
sl@0
   339
    d = oil_argb(
sl@0
   340
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
sl@0
   341
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
sl@0
   342
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
sl@0
   343
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
sl@0
   344
    *dest++ = d;
sl@0
   345
  }
sl@0
   346
}
sl@0
   347
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
sl@0
   348
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   349
sl@0
   350
SSE_FUNCTION static void
sl@0
   351
composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
sl@0
   352
    const uint8_t *mask, int n)
sl@0
   353
{
sl@0
   354
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   355
    uint32_t d = *dest, s = *src++, m = *mask++, color;
sl@0
   356
    uint8_t srca;
sl@0
   357
sl@0
   358
    color = oil_argb(
sl@0
   359
        COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   360
        COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   361
        COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   362
        COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   363
    srca = oil_argb_A(color);
sl@0
   364
    d = oil_argb(
sl@0
   365
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   366
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   367
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   368
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   369
    *dest++ = d;
sl@0
   370
  }
sl@0
   371
  for (; n >= 4; n -= 4) {
sl@0
   372
    __m128i d, s, m;
sl@0
   373
    s = load_argb_sse2(src);
sl@0
   374
    m = load_u8_mask(mask);
sl@0
   375
    s = muldiv_255_sse2(s, m);
sl@0
   376
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
sl@0
   377
    store_argb_sse2(dest, d);
sl@0
   378
    src += 4;
sl@0
   379
    mask += 4;
sl@0
   380
    dest += 4;
sl@0
   381
  }
sl@0
   382
  for (; n > 0; n--) {
sl@0
   383
    uint32_t d = *dest, s = *src++, m = *mask++, color;
sl@0
   384
    uint8_t srca;
sl@0
   385
sl@0
   386
    color = oil_argb(
sl@0
   387
        COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   388
        COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   389
        COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   390
        COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   391
    srca = oil_argb_A(color);
sl@0
   392
    d = oil_argb(
sl@0
   393
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   394
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   395
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   396
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   397
    *dest++ = d;
sl@0
   398
  }
sl@0
   399
}
sl@0
   400
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
sl@0
   401
    OIL_IMPL_FLAG_SSE2);
sl@0
   402
sl@0
   403
SSE_FUNCTION static void
sl@0
   404
composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
sl@0
   405
    const uint8_t *mask, int n)
sl@0
   406
{
sl@0
   407
  __m128i s;
sl@0
   408
sl@0
   409
  s = set1_argb_sse2(*src);
sl@0
   410
sl@0
   411
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   412
    uint32_t d = *dest, m = *mask++, color;
sl@0
   413
    uint8_t srca;
sl@0
   414
sl@0
   415
    color = oil_argb(
sl@0
   416
        COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   417
        COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   418
        COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   419
        COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   420
    srca = oil_argb_A(color);
sl@0
   421
    d = oil_argb(
sl@0
   422
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   423
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   424
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   425
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   426
    *dest++ = d;
sl@0
   427
  }
sl@0
   428
  for (; n >= 4; n -= 4) {
sl@0
   429
    __m128i d, color, m;
sl@0
   430
    m = load_u8_mask(mask);
sl@0
   431
    color = muldiv_255_sse2(s, m);
sl@0
   432
    d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));
sl@0
   433
    store_argb_sse2(dest, d);
sl@0
   434
    mask += 4;
sl@0
   435
    dest += 4;
sl@0
   436
  }
sl@0
   437
  for (; n > 0; n--) {
sl@0
   438
    uint32_t d = *dest, m = *mask++, color;
sl@0
   439
    uint8_t srca;
sl@0
   440
sl@0
   441
    color = oil_argb(
sl@0
   442
        COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   443
        COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   444
        COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   445
        COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   446
    srca = oil_argb_A(color);
sl@0
   447
    d = oil_argb(
sl@0
   448
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   449
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   450
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   451
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   452
    *dest++ = d;
sl@0
   453
  }
sl@0
   454
}
sl@0
   455
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
sl@0
   456
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   457
sl@0
   458
SSE_FUNCTION static void
sl@0
   459
composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
sl@0
   460
    const uint8_t *mask, int n)
sl@0
   461
{
sl@0
   462
  __m128i m;
sl@0
   463
sl@0
   464
  m = set1_u8_mask(*mask);
sl@0
   465
sl@0
   466
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   467
    uint32_t d = *dest, s = *src++, color;
sl@0
   468
    uint8_t srca;
sl@0
   469
sl@0
   470
    color = oil_argb(
sl@0
   471
        COMPOSITE_IN(oil_argb_A(s), *mask),
sl@0
   472
        COMPOSITE_IN(oil_argb_R(s), *mask),
sl@0
   473
        COMPOSITE_IN(oil_argb_G(s), *mask),
sl@0
   474
        COMPOSITE_IN(oil_argb_B(s), *mask));
sl@0
   475
    srca = oil_argb_A(color);
sl@0
   476
    d = oil_argb(
sl@0
   477
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   478
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   479
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   480
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   481
    *dest++ = d;
sl@0
   482
  }
sl@0
   483
  for (; n >= 4; n -= 4) {
sl@0
   484
    __m128i d, s;
sl@0
   485
    s = load_argb_sse2(src);
sl@0
   486
    s = muldiv_255_sse2(s, m);
sl@0
   487
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
sl@0
   488
    store_argb_sse2(dest, d);
sl@0
   489
    src += 4;
sl@0
   490
    dest += 4;
sl@0
   491
  }
sl@0
   492
  for (; n > 0; n--) {
sl@0
   493
    uint32_t d = *dest, s = *src++, color;
sl@0
   494
    uint8_t srca;
sl@0
   495
sl@0
   496
    color = oil_argb(
sl@0
   497
        COMPOSITE_IN(oil_argb_A(s), *mask),
sl@0
   498
        COMPOSITE_IN(oil_argb_R(s), *mask),
sl@0
   499
        COMPOSITE_IN(oil_argb_G(s), *mask),
sl@0
   500
        COMPOSITE_IN(oil_argb_B(s), *mask));
sl@0
   501
    srca = oil_argb_A(color);
sl@0
   502
    d = oil_argb(
sl@0
   503
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   504
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   505
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   506
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   507
    *dest++ = d;
sl@0
   508
  }
sl@0
   509
}
sl@0
   510
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
sl@0
   511
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
sl@0
   512
sl@0
   513
SSE_FUNCTION static void
sl@0
   514
composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
sl@0
   515
{
sl@0
   516
  /* Initial operations to align the destination pointer */
sl@0
   517
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   518
    *dest = COMPOSITE_OVER(*dest, *src, *src);
sl@0
   519
    src++;
sl@0
   520
    dest++;
sl@0
   521
  }
sl@0
   522
  /* over_u8 can be dealt with using our argb code, with srca = s */
sl@0
   523
  for (; n >= 16; n -= 16) {
sl@0
   524
    __m128i d, s;
sl@0
   525
    d = *(__m128i *)dest;
sl@0
   526
    s = load_argb_sse2((uint32_t *)src);
sl@0
   527
    store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
sl@0
   528
    src += 16;
sl@0
   529
    dest += 16;
sl@0
   530
  }
sl@0
   531
  for (; n > 0; n--) {
sl@0
   532
    *dest = COMPOSITE_OVER(*dest, *src, *src);
sl@0
   533
    src++;
sl@0
   534
    dest++;
sl@0
   535
  }
sl@0
   536
}
sl@0
   537
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,
sl@0
   538
    OIL_IMPL_FLAG_SSE2);
sl@0
   539
sl@0
   540
#endif
sl@0
   541