os/ossrv/genericopenlibs/liboil/src/composite_sse_2pix.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 * Copyright (c) 2005
sl@0
     3
 *	Eric Anholt.  All rights reserved.
sl@0
     4
 *
sl@0
     5
 * Redistribution and use in source and binary forms, with or without
sl@0
     6
 * modification, are permitted provided that the following conditions
sl@0
     7
 * are met:
sl@0
     8
 * 1. Redistributions of source code must retain the above copyright
sl@0
     9
 *    notice, this list of conditions and the following disclaimer.
sl@0
    10
 * 2. Redistributions in binary form must reproduce the above copyright
sl@0
    11
 *    notice, this list of conditions and the following disclaimer in the
sl@0
    12
 *    documentation and/or other materials provided with the distribution.
sl@0
    13
 *
sl@0
    14
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
sl@0
    15
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
sl@0
    16
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
sl@0
    17
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
sl@0
    18
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
sl@0
    19
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
sl@0
    20
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
sl@0
    21
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
sl@0
    22
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
sl@0
    23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
sl@0
    24
 * SUCH DAMAGE.
sl@0
    25
 */
sl@0
    26
sl@0
    27
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"
sl@0
    34
sl@0
    35
/* Attribute placed on every function in this file that uses SSE2 intrinsics.
 * It expands to GCC's force_align_arg_pointer function attribute.
 * NOTE(review): presumably needed so 16-byte __m128i locals stay aligned when
 * the caller's stack is not; confirm against the GCC documentation.
 */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
sl@0
    36
sl@0
    37
/* non-SSE2 compositing support */
sl@0
    38
/* Scalar OVER for one channel: d + s - oil_muldiv_255(d, m).  Callers in this
 * file pass the source alpha as m.  Note that d is expanded twice.
 * oil_muldiv_255 is defined outside this file (by its name: a*b/255).
 */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
sl@0
    39
/* Scalar ADD for one channel: d + s passed through oil_clamp_255.
 * Not referenced by any function visible in this file.
 */
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
sl@0
    40
/* Scalar IN for one channel: oil_muldiv_255(s, m), i.e. the source channel
 * scaled by the mask value m.
 */
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
sl@0
    41
sl@0
    42
/* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
sl@0
    43
 * the channel value in the low byte.  This means 2 pixels per pass.
sl@0
    44
 */
sl@0
    45
sl@0
    46
#ifdef ENABLE_BROKEN_IMPLS
sl@0
    47
sl@0
    48
/* Lets a 128-bit SSE2 constant be written as two 64-bit integers in a static
 * initializer and then read back as an __m128i.
 */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constant table used by the SSE2 helpers below (accessed through MC()).
 *
 *   sse_8x00ff: eight 16-bit lanes of 0x00ff, XOR mask that inverts the low
 *               byte of every lane.
 *   sse_8x0080: eight 16-bit lanes of 0x0080, rounding bias for the
 *               divide-by-255 step.
 *
 * The struct tag avoids the reserved "_" + uppercase identifier form.
 */
static const struct sse_data {
  union m128_int sse_8x00ff;
  union m128_int sse_8x0080;
} c = {
    .sse_8x00ff.ull =	{0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
    .sse_8x0080.ull =	{0x0080008000800080ULL, 0x0080008000800080ULL},
};
sl@0
    60
sl@0
    61
/* MC(x) yields the __m128i view of the constant c.sse_<x>,
 * e.g. MC(8x0080) expands to c.sse_8x0080.m128.
 */
#define MC(x) (c.sse_##x.m128)
sl@0
    62
sl@0
    63
/* Shuffles the given value such that the alpha for each pixel appears in each
sl@0
    64
 * channel of the pixel.
sl@0
    65
 */
sl@0
    66
SSE_FUNCTION static inline __m128i
sl@0
    67
argb_A_sse2(__m128i a)
sl@0
    68
{
sl@0
    69
  a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
sl@0
    70
  a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
sl@0
    71
  return a;
sl@0
    72
}
sl@0
    73
sl@0
    74
/* Multiplies the pixel data in a channel-by-channel by b, and divides the
sl@0
    75
 * result by 255, with rounding.
sl@0
    76
 */
sl@0
    77
SSE_FUNCTION static inline __m128i
sl@0
    78
muldiv_255_sse2(__m128i a, __m128i b)
sl@0
    79
{
sl@0
    80
  __m128i ret;
sl@0
    81
  __m128i roundconst = MC(8x0080);
sl@0
    82
sl@0
    83
  ret = _mm_mullo_epi16(a, b);
sl@0
    84
  ret = _mm_adds_epu16(ret, roundconst);
sl@0
    85
  ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
sl@0
    86
  ret = _mm_srli_epi16(ret, 8);
sl@0
    87
sl@0
    88
  return ret;
sl@0
    89
}
sl@0
    90
sl@0
    91
/* Returns a with the low byte of every 16-bit lane inverted (lane ^ 0x00ff).
 * For an unpacked channel value v in 0..255 this gives 255 - v.
 */
SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)
{
  const __m128i low_byte_bits = MC(8x00ff);

  return _mm_xor_si128(a, low_byte_bits);
}
sl@0
    96
sl@0
    97
/* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
sl@0
    98
SSE_FUNCTION static inline __m128i
sl@0
    99
load_argb_sse2(const uint32_t *src)
sl@0
   100
{
sl@0
   101
  __m128i pix;
sl@0
   102
sl@0
   103
  pix = _mm_loadl_epi64((__m128i *)src);
sl@0
   104
  pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
sl@0
   105
  return pix;
sl@0
   106
}
sl@0
   107
sl@0
   108
/* Returns an unpacked register holding two copies of the single pixel src:
 * the 32-bit value is replicated, then its bytes are zero-extended to
 * 16-bit lanes.
 */
SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)
{
  const __m128i replicated = _mm_set1_epi32(src);

  return _mm_unpacklo_epi8(replicated, _mm_setzero_si128());
}
sl@0
   117
sl@0
   118
/* Loads two consecutive mask bytes for a 2-pixel pass: m[0] fills the four
 * 16-bit lanes of the low half, m[1] the four lanes of the high half.
 */
SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t *m)
{
  const __m128i first_pixel = _mm_set1_epi16(m[0]);
  const __m128i second_pixel = _mm_set1_epi16(m[1]);

  return _mm_unpacklo_epi64(first_pixel, second_pixel);
}
sl@0
   123
sl@0
   124
/* Returns a register whose eight 16-bit lanes all hold the mask value m
 * (byte-replicated, then zero-extended to 16 bits).
 */
SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)
{
  const __m128i every_byte = _mm_set1_epi8(m);

  return _mm_unpacklo_epi8(every_byte, _mm_setzero_si128());
}
sl@0
   129
sl@0
   130
/* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
sl@0
   131
SSE_FUNCTION static void
sl@0
   132
store_argb_sse2(uint32_t *dest, __m128i pix)
sl@0
   133
{
sl@0
   134
  pix = _mm_packus_epi16(pix, pix);
sl@0
   135
  _mm_storel_epi64((__m128i *)dest, pix);
sl@0
   136
}
sl@0
   137
sl@0
   138
SSE_FUNCTION static __m128i 
sl@0
   139
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
sl@0
   140
{
sl@0
   141
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
sl@0
   142
}
sl@0
   143
sl@0
   144
SSE_FUNCTION static void
sl@0
   145
composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   146
    const uint8_t *mask, int n)
sl@0
   147
{
sl@0
   148
  for (; n >= 2; n -= 2) {
sl@0
   149
    __m128i s, m;
sl@0
   150
    s = load_argb_sse2(src);
sl@0
   151
    m = load_u8_mask(mask);
sl@0
   152
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
sl@0
   153
    src += 2;
sl@0
   154
    mask += 2;
sl@0
   155
    dest += 2;
sl@0
   156
  }
sl@0
   157
  for (; n > 0; n--) {
sl@0
   158
    uint32_t s = *src++;
sl@0
   159
    uint8_t m = *mask++;
sl@0
   160
sl@0
   161
    *dest++ = oil_argb(
sl@0
   162
	COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   163
	COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   164
	COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   165
	COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   166
  }
sl@0
   167
}
sl@0
   168
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
sl@0
   169
    OIL_IMPL_FLAG_SSE2);
sl@0
   170
sl@0
   171
SSE_FUNCTION static void
sl@0
   172
composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   173
    const uint8_t *mask, int n)
sl@0
   174
{
sl@0
   175
  __m128i s;
sl@0
   176
sl@0
   177
  s = set1_argb_sse2(*src);
sl@0
   178
sl@0
   179
  for (; n >= 2; n -= 2) {
sl@0
   180
    __m128i m;
sl@0
   181
    m = load_u8_mask(mask);
sl@0
   182
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
sl@0
   183
    mask += 2;
sl@0
   184
    dest += 2;
sl@0
   185
  }
sl@0
   186
  for (; n > 0; n--) {
sl@0
   187
    uint8_t m = *mask++;
sl@0
   188
sl@0
   189
    *dest++ = oil_argb(
sl@0
   190
	COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   191
	COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   192
	COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   193
	COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   194
  }
sl@0
   195
}
sl@0
   196
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
sl@0
   197
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   198
sl@0
   199
#ifdef SSE_ALIGN
sl@0
   200
SSE_FUNCTION static void
sl@0
   201
composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   202
    const uint8_t *mask, int n)
sl@0
   203
{
sl@0
   204
  __m128i m;
sl@0
   205
sl@0
   206
  m = set1_u8_mask(*mask);
sl@0
   207
sl@0
   208
  for (; n >= 2; n -= 2) {
sl@0
   209
    __m128i s;
sl@0
   210
    s = load_argb_sse2(src);
sl@0
   211
    store_argb_sse2(dest,  muldiv_255_sse2(s, m));
sl@0
   212
    src += 2;
sl@0
   213
    dest += 2;
sl@0
   214
  }
sl@0
   215
  for (; n > 0; n--) {
sl@0
   216
    uint32_t s = *src++;
sl@0
   217
sl@0
   218
    *dest++ = oil_argb(
sl@0
   219
	COMPOSITE_IN(oil_argb_A(s), mask[0]),
sl@0
   220
	COMPOSITE_IN(oil_argb_R(s), mask[0]),
sl@0
   221
	COMPOSITE_IN(oil_argb_G(s), mask[0]),
sl@0
   222
	COMPOSITE_IN(oil_argb_B(s), mask[0]));
sl@0
   223
  }
sl@0
   224
}
sl@0
   225
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
sl@0
   226
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
sl@0
   227
#endif
sl@0
   228
sl@0
   229
SSE_FUNCTION static void
sl@0
   230
composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
sl@0
   231
{
sl@0
   232
  for (; n >= 2; n -= 2) {
sl@0
   233
    __m128i d, s;
sl@0
   234
    s = load_argb_sse2(src);
sl@0
   235
    d = load_argb_sse2(dest);
sl@0
   236
    d = over_argb_sse2(d, s, argb_A_sse2(s));
sl@0
   237
    store_argb_sse2(dest, d);
sl@0
   238
    src += 2;
sl@0
   239
    dest += 2;
sl@0
   240
  }
sl@0
   241
  for (; n > 0; n--) {
sl@0
   242
    uint32_t d = *dest, s = *src++;
sl@0
   243
    uint8_t srca = oil_argb_A(s);
sl@0
   244
    d = oil_argb(
sl@0
   245
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
sl@0
   246
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
sl@0
   247
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
sl@0
   248
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
sl@0
   249
    *dest++ = d;
sl@0
   250
  }
sl@0
   251
}
sl@0
   252
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
sl@0
   253
    OIL_IMPL_FLAG_SSE2);
sl@0
   254
sl@0
   255
SSE_FUNCTION static void
sl@0
   256
composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   257
    int n)
sl@0
   258
{
sl@0
   259
  __m128i s, sa;
sl@0
   260
  uint32_t srca;
sl@0
   261
sl@0
   262
  srca = oil_argb_A(*src);
sl@0
   263
  s = set1_argb_sse2(*src);
sl@0
   264
  sa = negate_argb_sse2(argb_A_sse2(s));
sl@0
   265
  for (; n >= 2; n -= 2) {
sl@0
   266
    __m128i d;
sl@0
   267
    d = load_argb_sse2(dest);
sl@0
   268
    d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
sl@0
   269
    store_argb_sse2(dest, d);
sl@0
   270
    dest += 2;
sl@0
   271
  }
sl@0
   272
  for (; n > 0; n--) {
sl@0
   273
    uint32_t d = *dest;
sl@0
   274
    d = oil_argb(
sl@0
   275
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
sl@0
   276
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
sl@0
   277
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
sl@0
   278
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
sl@0
   279
    *dest++ = d;
sl@0
   280
  }
sl@0
   281
}
sl@0
   282
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
sl@0
   283
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   284
sl@0
   285
SSE_FUNCTION static void
sl@0
   286
composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   287
    const uint8_t *mask, int n)
sl@0
   288
{
sl@0
   289
  for (; n >= 2; n -= 2) {
sl@0
   290
    __m128i d, s, m;
sl@0
   291
    s = load_argb_sse2(src);
sl@0
   292
    m = load_u8_mask(mask);
sl@0
   293
    d = load_argb_sse2(dest);
sl@0
   294
    s = muldiv_255_sse2(s, m);
sl@0
   295
    d = over_argb_sse2(d, s, argb_A_sse2(s));
sl@0
   296
    store_argb_sse2(dest, d);
sl@0
   297
    src += 2;
sl@0
   298
    mask += 2;
sl@0
   299
    dest += 2;
sl@0
   300
  }
sl@0
   301
  for (; n > 0; n--) {
sl@0
   302
    uint32_t d = *dest, s = *src++, m = *mask++, color;
sl@0
   303
    uint8_t srca;
sl@0
   304
sl@0
   305
    color = oil_argb(
sl@0
   306
        COMPOSITE_IN(oil_argb_A(s), m),
sl@0
   307
        COMPOSITE_IN(oil_argb_R(s), m),
sl@0
   308
        COMPOSITE_IN(oil_argb_G(s), m),
sl@0
   309
        COMPOSITE_IN(oil_argb_B(s), m));
sl@0
   310
    srca = oil_argb_A(color);
sl@0
   311
    d = oil_argb(
sl@0
   312
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   313
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   314
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   315
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   316
    *dest++ = d;
sl@0
   317
  }
sl@0
   318
}
sl@0
   319
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
sl@0
   320
    OIL_IMPL_FLAG_SSE2);
sl@0
   321
sl@0
   322
SSE_FUNCTION static void
sl@0
   323
composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   324
    const uint8_t *mask, int n)
sl@0
   325
{
sl@0
   326
  __m128i s;
sl@0
   327
sl@0
   328
  s = set1_argb_sse2(*src);
sl@0
   329
sl@0
   330
  for (; n >= 2; n -= 2) {
sl@0
   331
    __m128i d, color, m;
sl@0
   332
    m = load_u8_mask(mask);
sl@0
   333
    d = load_argb_sse2(dest);
sl@0
   334
    color = muldiv_255_sse2(s, m);
sl@0
   335
    d = over_argb_sse2(d, color, argb_A_sse2(color));
sl@0
   336
    store_argb_sse2(dest, d);
sl@0
   337
    mask += 2;
sl@0
   338
    dest += 2;
sl@0
   339
  }
sl@0
   340
  for (; n > 0; n--) {
sl@0
   341
    uint32_t d = *dest, m = *mask++, color;
sl@0
   342
    uint8_t srca;
sl@0
   343
sl@0
   344
    color = oil_argb(
sl@0
   345
        COMPOSITE_IN(oil_argb_A(*src), m),
sl@0
   346
        COMPOSITE_IN(oil_argb_R(*src), m),
sl@0
   347
        COMPOSITE_IN(oil_argb_G(*src), m),
sl@0
   348
        COMPOSITE_IN(oil_argb_B(*src), m));
sl@0
   349
    srca = oil_argb_A(color);
sl@0
   350
    d = oil_argb(
sl@0
   351
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   352
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   353
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   354
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   355
    *dest++ = d;
sl@0
   356
  }
sl@0
   357
}
sl@0
   358
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
sl@0
   359
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
sl@0
   360
sl@0
   361
SSE_FUNCTION static void
sl@0
   362
composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
sl@0
   363
    const uint8_t *mask, int n)
sl@0
   364
{
sl@0
   365
  __m128i m;
sl@0
   366
sl@0
   367
  m = set1_u8_mask(*mask);
sl@0
   368
sl@0
   369
  for (; n >= 2; n -= 2) {
sl@0
   370
    __m128i d, s;
sl@0
   371
    s = load_argb_sse2(src);
sl@0
   372
    d = load_argb_sse2(dest);
sl@0
   373
    s = muldiv_255_sse2(s, m);
sl@0
   374
    d = over_argb_sse2(d, s, argb_A_sse2(s));
sl@0
   375
    store_argb_sse2(dest, d);
sl@0
   376
    src += 2;
sl@0
   377
    dest += 2;
sl@0
   378
  }
sl@0
   379
  for (; n > 0; n--) {
sl@0
   380
    uint32_t d = *dest, s = *src++, color;
sl@0
   381
    uint8_t srca;
sl@0
   382
sl@0
   383
    color = oil_argb(
sl@0
   384
        COMPOSITE_IN(oil_argb_A(s), *mask),
sl@0
   385
        COMPOSITE_IN(oil_argb_R(s), *mask),
sl@0
   386
        COMPOSITE_IN(oil_argb_G(s), *mask),
sl@0
   387
        COMPOSITE_IN(oil_argb_B(s), *mask));
sl@0
   388
    srca = oil_argb_A(color);
sl@0
   389
    d = oil_argb(
sl@0
   390
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
sl@0
   391
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
sl@0
   392
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
sl@0
   393
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
sl@0
   394
    *dest++ = d;
sl@0
   395
  }
sl@0
   396
}
sl@0
   397
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
sl@0
   398
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
sl@0
   399
sl@0
   400
SSE_FUNCTION static void
sl@0
   401
composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
sl@0
   402
{
sl@0
   403
  /* Initial operations to align the destination pointer */
sl@0
   404
  for (; ((long)dest & 15) && (n > 0); n--) {
sl@0
   405
    *dest = COMPOSITE_OVER(*dest, *src, *src);
sl@0
   406
    src++;
sl@0
   407
    dest++;
sl@0
   408
  }
sl@0
   409
  /* over_u8 can be dealt with using our argb code, with srca = s */
sl@0
   410
  for (; n >= 8; n -= 8) {
sl@0
   411
    __m128i d, s;
sl@0
   412
    d = load_argb_sse2((uint32_t *)dest);
sl@0
   413
    s = load_argb_sse2((uint32_t *)src);
sl@0
   414
    store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
sl@0
   415
    src += 8;
sl@0
   416
    dest += 8;
sl@0
   417
  }
sl@0
   418
  for (; n > 0; n--) {
sl@0
   419
    *dest = COMPOSITE_OVER(*dest, *src, *src);
sl@0
   420
    src++;
sl@0
   421
    dest++;
sl@0
   422
  }
sl@0
   423
}
sl@0
   424
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
sl@0
   425
    OIL_IMPL_FLAG_SSE2);
sl@0
   426
#endif
sl@0
   427