os/ossrv/genericopenlibs/liboil/src/composite_sse_4pix.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
/*
 * Copyright (c) 2005
 *	Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"

#define SSE_FUNCTION __attribute__((force_align_arg_pointer))

#ifdef ENABLE_BROKEN_IMPLS

union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

static const struct _SSEData {
  union m128_int sse_16xff;
  union m128_int sse_8x0080;
} c = {
    .sse_16xff.ull =  {0xffffffffffffffffULL, 0xffffffffffffffffULL},
    .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

#define MC(x) (c.sse_##x.m128)

/* non-SSE2 compositing support */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
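
/* Worked example, assuming oil_muldiv_255(a,b) is liboil's rounded a*b/255:
 *   COMPOSITE_IN(0xff, 0x40)         = round(255*64/255)  = 0x40
 *   COMPOSITE_OVER(0x20, 0x40, 0xff) = 0x20 + 0x40 - 0x20 = 0x40  (opaque source replaces dest)
 *   COMPOSITE_OVER(0x20, 0x00, 0x00) = 0x20 + 0x00 - 0x00 = 0x20  (fully transparent source leaves dest)
 */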

/* This SSE2 code is based around operations on four pixels at a time.  The
 * exception is muldiv_255_sse2, which needs to expand the four pixels into
 * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
 * the appropriate rounding on division.
 */

/* Shuffles the given value such that the alpha for each pixel appears in each
 * channel of the pixel.
 */
SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)
{
#if 0
  /* Shift the alpha channel of each pixel into the low byte */
  a = _mm_srli_epi32(a, 24);
  /* Now, shift and or so we can get it into all the channels */
  a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
  a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
  return a;
#else
  /* Move the alpha channel into the low byte */
  a = _mm_srli_epi32(a, 24);
  /* Pack our four alpha channels down into the lower 32 bits */
  a = _mm_packus_epi16(a, _mm_setzero_si128());
  a = _mm_packus_epi16(a, _mm_setzero_si128());
  /* And expand it back out into four pixels of all channels the same */
  a = _mm_unpacklo_epi8(a, a);
  return _mm_unpacklo_epi16(a, a);
#endif
}
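
/* Informal trace, writing each 32-bit pixel lane as 0xAARRGGBB: for input
 * pixels 0x80112233 and 0x40445566, the shift right by 24 leaves 0x00000080
 * and 0x00000040 in their lanes, the two packus steps gather the four alpha
 * bytes into the low 32 bits of the register, and the two unpacklo steps
 * replicate them back out to 0x80808080 and 0x40404040 -- alpha in every
 * channel, ready for muldiv_255_sse2 below.
 */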

/* Multiplies the unpacked 16-bits-per-channel pixel data in a by b,
 * channel by channel, and divides the result by 255, with rounding.
 */
SSE_FUNCTION static inline __m128i
inner_muldiv_255_sse2(__m128i a, __m128i b)
{
  __m128i ret;
  __m128i roundconst = MC(8x0080);

  ret = _mm_mullo_epi16(a, b);
  ret = _mm_adds_epu16(ret, roundconst);
  ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
  ret = _mm_srli_epi16(ret, 8);

  return ret;
}
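
/* A plain-C sketch of the same per-lane arithmetic (illustration only, not
 * part of liboil): for 8-bit inputs the add-0x80 / add-high-byte / shift
 * sequence above computes round(a*b/255), e.g. 128*128 -> 64 and
 * 255*255 -> 255.
 */
static inline uint16_t
muldiv_255_scalar_sketch (uint16_t a, uint16_t b)
{
  uint32_t t = (uint32_t)a * b + 0x80;     /* product plus rounding constant */
  return (uint16_t)((t + (t >> 8)) >> 8);  /* (t + t/256) / 256 ~ t/255 */
}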

SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a, __m128i b)
{
  __m128i alow, blow, ahigh, bhigh, low, high;

  alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
  blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
  ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
  bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
  low = inner_muldiv_255_sse2(alow, blow);
  high = inner_muldiv_255_sse2(ahigh, bhigh);
  return _mm_packus_epi16(low, high);
}

SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)
{
  return _mm_xor_si128(a, MC(16xff));
}

SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t *src)
{
  return _mm_loadu_si128((__m128i *)src);
}

SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)
{
  return _mm_set1_epi32(src);
}

SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t *m)
{
  __m128i a;
  a = _mm_cvtsi32_si128(*(uint32_t *)m);
  a = _mm_unpacklo_epi8(a, a);
  a = _mm_unpacklo_epi16(a, a);
  return a;
}

SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)
{
  return _mm_set1_epi8(m);
}

SSE_FUNCTION static void
store_argb_sse2(uint32_t *dest, __m128i pix)
{
  _mm_store_si128((__m128i *)dest, pix);
}
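
/* OVER in its premultiplied-alpha form: src + dest * (255 - srca) / 255 per
 * channel, with a saturating add.  srca is expected to already have the
 * source alpha replicated into every channel (see argb_A_sse2 above).
 */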
SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
{
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
}

SSE_FUNCTION static void
composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
    int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t s = *src++;
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
  }
  for (; n >= 4; n -= 4) {
    __m128i s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 4;
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t s = *src++;
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
    OIL_IMPL_FLAG_SSE2);
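
/* The remaining composite_*_sse implementations below follow the same shape
 * as the function above: a scalar loop until dest reaches 16-byte alignment,
 * an SSE2 loop doing four ARGB pixels (or sixteen u8 values) per iteration,
 * and a scalar tail for whatever is left.  The OIL_DEFINE_IMPL_FULL_WRAPPER
 * lines register each function as an SSE2 candidate for its liboil function
 * class; callers are normally expected to go through liboil's dispatcher
 * (e.g. oil_composite_in_argb()) rather than call these symbols directly.
 */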

SSE_FUNCTION static void
composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
  for (; n >= 4; n -= 4) {
    __m128i m;
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  m = set1_u8_mask(*mask);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
  for (; n >= 4; n -= 4) {
    __m128i s;
    s = load_argb_sse2(src);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
    *dest++ = d;
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
    OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
{
  __m128i s, sa;
  uint32_t srca;

  srca = oil_argb_A(*src);
  s = set1_argb_sse2(*src);
  sa = negate_argb_sse2(argb_A_sse2(s));
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest;
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
    *dest++ = d;
  }
  for (; n >= 4; n -= 4) {
    __m128i d;
    d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
    store_argb_sse2(dest, d);
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest;
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
    OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  for (; n >= 4; n -= 4) {
    __m128i d, color, m;
    m = load_u8_mask(mask);
    color = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));
    store_argb_sse2(dest, d);
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  m = set1_u8_mask(*mask);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
  /* over_u8 can be dealt with using our argb code, with srca = s */
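  /* (Per channel, COMPOSITE_OVER(d, s, s) is s + d - d*s/255, which is the u8
   * "over" with the source byte acting as its own alpha; since every byte
   * lane is independent, each vector iteration below covers 16 u8 values.)
   */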
  for (; n >= 16; n -= 16) {
    __m128i d, s;
    d = *(__m128i *)dest;
    s = load_argb_sse2((uint32_t *)src);
    store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
    src += 16;
    dest += 16;
  }
  for (; n > 0; n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,
    OIL_IMPL_FLAG_SSE2);

#endif