1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/composite_sse_4pix.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,541 @@
1.4 +/*
1.5 + * Copyright (c) 2005
1.6 + * Eric Anholt. All rights reserved.
1.7 + *
1.8 + * Redistribution and use in source and binary forms, with or without
1.9 + * modification, are permitted provided that the following conditions
1.10 + * are met:
1.11 + * 1. Redistributions of source code must retain the above copyright
1.12 + * notice, this list of conditions and the following disclaimer.
1.13 + * 2. Redistributions in binary form must reproduce the above copyright
1.14 + * notice, this list of conditions and the following disclaimer in the
1.15 + * documentation and/or other materials provided with the distribution.
1.16 + *
1.17 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
1.18 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1.19 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.20 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
1.21 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1.22 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1.23 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.24 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1.25 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1.26 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1.27 + * SUCH DAMAGE.
1.28 + */
1.29 +
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include <string.h>
#include "liboil/liboilcolorspace.h"
1.37 +
/* Forces 16-byte stack realignment on entry (GCC x86 attribute), so SSE
 * spills inside these functions are safe even when the caller's ABI only
 * guarantees 4-byte stack alignment. */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
1.39 +
1.40 +#ifdef ENABLE_BROKEN_IMPLS
1.41 +
/* Lets a 128-bit SSE constant be initialized from two 64-bit integer halves
 * and then read back directly as an __m128i. */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};
1.46 +
/* Constant vectors used by the SSE2 paths:
 *   sse_16xff  - 0xff in all 16 bytes (XOR mask for channel complement)
 *   sse_8x0080 - 0x0080 in each 16-bit lane (rounding bias for /255)
 */
static const struct _SSEData {
  union m128_int sse_16xff;
  union m128_int sse_8x0080;
} c = {
  .sse_16xff.ull = {0xffffffffffffffffULL, 0xffffffffffffffffULL},
  .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

/* Fetches one of the constants above as an __m128i. */
#define MC(x) (c.sse_##x.m128)
1.56 +
/* non-SSE2 compositing support: scalar per-channel operators used for the
 * unaligned head and the short tail of each loop below.  Operands are
 * single 8-bit channel values. */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
1.61 +
1.62 +/* This SSE2 code is based around operations on four pixels at a time. The
1.63 + * exception is muldiv_255_sse2, which needs to expand the four pixels into
1.64 + * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
1.65 + * the appropriate rounding on division.
1.66 + */
1.67 +
1.68 +/* Shuffles the given value such that the alpha for each pixel appears in each
1.69 + * channel of the pixel.
1.70 + */
1.71 +SSE_FUNCTION static inline __m128i
1.72 +argb_A_sse2(__m128i a)
1.73 +{
1.74 +#if 0
1.75 + /* Shift the alpha channel of each pixel into the low byte */
1.76 + a = _mm_srli_epi32(a, 24);
1.77 + /* Now, shift and or so we can get it into all the channels */
1.78 + a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
1.79 + a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
1.80 + return a;
1.81 +#else
1.82 + /* Move the alpha channel into the low byte */
1.83 + a = _mm_srli_epi32(a, 24);
1.84 + /* Pack our four alpha channels down into the lower 32 bits */
1.85 + a = _mm_packus_epi16(a, _mm_setzero_si128());
1.86 + a = _mm_packus_epi16(a, _mm_setzero_si128());
1.87 + /* And expand it back out into four pixels of all channels the same */
1.88 + a = _mm_unpacklo_epi8(a, a);
1.89 + return _mm_unpacklo_epi16(a, a);
1.90 +#endif
1.91 +}
1.92 +
/* Multiplies the unpacked 16-bits-per-channel pixel data in a
 * channel-by-channel by b, and divides the result by 255, with rounding.
 *
 * Uses the identity x/255 = (x + 128 + ((x + 128) >> 8)) >> 8, which is
 * exact for every x = a*b with a, b in [0, 255].  The maximum intermediate
 * (255*255 + 128 + 254 = 65407) fits in 16 bits, so the saturating adds
 * never actually saturate for valid channel inputs.
 */
SSE_FUNCTION static inline __m128i
inner_muldiv_255_sse2(__m128i a, __m128i b)
{
  __m128i ret;
  /* 0x0080 in each lane: the +128 rounding bias */
  __m128i roundconst = MC(8x0080);

  ret = _mm_mullo_epi16(a, b);
  ret = _mm_adds_epu16(ret, roundconst);
  ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
  ret = _mm_srli_epi16(ret, 8);

  return ret;
}
1.109 +
1.110 +SSE_FUNCTION static inline __m128i
1.111 +muldiv_255_sse2(__m128i a, __m128i b)
1.112 +{
1.113 + __m128i alow, blow, ahigh, bhigh, low, high;
1.114 +
1.115 + alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
1.116 + blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
1.117 + ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
1.118 + bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
1.119 + low = inner_muldiv_255_sse2(alow, blow);
1.120 + high = inner_muldiv_255_sse2(ahigh, bhigh);
1.121 + return _mm_packus_epi16(low, high);
1.122 +}
1.123 +
1.124 +SSE_FUNCTION static inline __m128i
1.125 +negate_argb_sse2(__m128i a)
1.126 +{
1.127 + return _mm_xor_si128(a, MC(16xff));
1.128 +}
1.129 +
1.130 +SSE_FUNCTION static inline __m128i
1.131 +load_argb_sse2(const uint32_t *src)
1.132 +{
1.133 + return _mm_loadu_si128((__m128i *)src);
1.134 +}
1.135 +
1.136 +SSE_FUNCTION static inline __m128i
1.137 +set1_argb_sse2(uint32_t src)
1.138 +{
1.139 + return _mm_set1_epi32(src);
1.140 +}
1.141 +
1.142 +SSE_FUNCTION static inline __m128i
1.143 +load_u8_mask(const uint8_t *m)
1.144 +{
1.145 + __m128i a;
1.146 + a = _mm_cvtsi32_si128(*(uint32_t *)m);
1.147 + a = _mm_unpacklo_epi8(a, a);
1.148 + a = _mm_unpacklo_epi16(a, a);
1.149 + return a;
1.150 +}
1.151 +
1.152 +SSE_FUNCTION static inline __m128i
1.153 +set1_u8_mask(uint8_t m)
1.154 +{
1.155 + return _mm_set1_epi8(m);
1.156 +}
1.157 +
/* Stores four ARGB pixels to dest.  dest must be 16-byte aligned
 * (_mm_store_si128 is the aligned store); the callers guarantee this
 * with their scalar alignment prologues. */
SSE_FUNCTION static void
store_argb_sse2(uint32_t *dest, __m128i pix)
{
  _mm_store_si128((__m128i *)dest, pix);
}
1.163 +
/* OVER for premultiplied ARGB: src + dest * (255 - srca) / 255, per channel.
 * srca must already hold each source pixel's alpha replicated into every
 * channel (see argb_A_sse2).  The byte add saturates. */
SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
{
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
}
1.169 +
1.170 +SSE_FUNCTION static void
1.171 +composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
1.172 + int n)
1.173 +{
1.174 + for (; ((long)dest & 15) && (n > 0); n--) {
1.175 + uint32_t s = *src++;
1.176 + uint8_t m = *mask++;
1.177 +
1.178 + *dest++ = oil_argb(
1.179 + COMPOSITE_IN(oil_argb_A(s), m),
1.180 + COMPOSITE_IN(oil_argb_R(s), m),
1.181 + COMPOSITE_IN(oil_argb_G(s), m),
1.182 + COMPOSITE_IN(oil_argb_B(s), m));
1.183 + }
1.184 + for (; n >= 4; n -= 4) {
1.185 + __m128i s, m;
1.186 + s = load_argb_sse2(src);
1.187 + m = load_u8_mask(mask);
1.188 + store_argb_sse2(dest, muldiv_255_sse2(s, m));
1.189 + src += 4;
1.190 + mask += 4;
1.191 + dest += 4;
1.192 + }
1.193 + for (; n > 0; n--) {
1.194 + uint32_t s = *src++;
1.195 + uint8_t m = *mask++;
1.196 +
1.197 + *dest++ = oil_argb(
1.198 + COMPOSITE_IN(oil_argb_A(s), m),
1.199 + COMPOSITE_IN(oil_argb_R(s), m),
1.200 + COMPOSITE_IN(oil_argb_G(s), m),
1.201 + COMPOSITE_IN(oil_argb_B(s), m));
1.202 + }
1.203 +}
1.204 +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
1.205 + OIL_IMPL_FLAG_SSE2);
1.206 +
/* IN operator with one constant source pixel: dest[i] = *src * mask[i] / 255. */
SSE_FUNCTION static void
composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  /* Broadcast the constant source pixel into all four lanes once. */
  s = set1_argb_sse2(*src);

  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
  /* Vector body: four pixels per iteration. */
  for (; n >= 4; n -= 4) {
    __m128i m;
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    mask += 4;
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.243 +
1.244 +SSE_FUNCTION static void
1.245 +composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
1.246 + const uint8_t *mask, int n)
1.247 +{
1.248 + __m128i m;
1.249 +
1.250 + m = set1_u8_mask(*mask);
1.251 +
1.252 + for (; ((long)dest & 15) && (n > 0); n--) {
1.253 + uint32_t s = *src++;
1.254 +
1.255 + *dest++ = oil_argb(
1.256 + COMPOSITE_IN(oil_argb_A(s), mask[0]),
1.257 + COMPOSITE_IN(oil_argb_R(s), mask[0]),
1.258 + COMPOSITE_IN(oil_argb_G(s), mask[0]),
1.259 + COMPOSITE_IN(oil_argb_B(s), mask[0]));
1.260 + }
1.261 + for (; n >= 4; n -= 4) {
1.262 + __m128i s;
1.263 + s = load_argb_sse2(src);
1.264 + store_argb_sse2(dest, muldiv_255_sse2(s, m));
1.265 + src += 4;
1.266 + dest += 4;
1.267 + }
1.268 + for (; n > 0; n--) {
1.269 + uint32_t s = *src++;
1.270 +
1.271 + *dest++ = oil_argb(
1.272 + COMPOSITE_IN(oil_argb_A(s), mask[0]),
1.273 + COMPOSITE_IN(oil_argb_R(s), mask[0]),
1.274 + COMPOSITE_IN(oil_argb_G(s), mask[0]),
1.275 + COMPOSITE_IN(oil_argb_B(s), mask[0]));
1.276 + }
1.277 +}
1.278 +OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
1.279 + composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
1.280 +
/* OVER operator on premultiplied ARGB:
 * dest = src + dest * (255 - src_alpha) / 255, per channel. */
SSE_FUNCTION static void
composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
{
  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
    *dest++ = d;
  }
  /* Vector body: dest is now aligned, so the direct *(__m128i *) load is
   * safe; src may still be unaligned and goes through load_argb_sse2. */
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
    OIL_IMPL_FLAG_SSE2);
1.315 +
/* OVER operator with one constant (premultiplied) source pixel. */
SSE_FUNCTION static void
composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
{
  __m128i s, sa;
  uint32_t srca;

  /* Precompute once: the broadcast source pixel and the complement of its
   * alpha replicated into every channel (the "255 - alpha" OVER factor). */
  srca = oil_argb_A(*src);
  s = set1_argb_sse2(*src);
  sa = negate_argb_sse2(argb_A_sse2(s));
  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest;
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
    *dest++ = d;
  }
  /* Vector body: OVER inlined using the precomputed sa factor. */
  for (; n >= 4; n -= 4) {
    __m128i d;
    d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
    store_argb_sse2(dest, d);
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint32_t d = *dest;
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.352 +
/* IN then OVER: dest = (src * mask / 255) OVER dest. */
SSE_FUNCTION static void
composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    /* IN: scale the source pixel by the mask */
    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    /* OVER: composite the masked color onto dest */
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  /* Vector body: four pixels per iteration. */
  for (; n >= 4; n -= 4) {
    __m128i d, s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    s = muldiv_255_sse2(s, m);  /* IN */
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));  /* OVER */
    store_argb_sse2(dest, d);
    src += 4;
    mask += 4;
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
    OIL_IMPL_FLAG_SSE2);
1.405 +
/* IN then OVER with one constant source pixel:
 * dest = (*src * mask[i] / 255) OVER dest. */
SSE_FUNCTION static void
composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  /* Broadcast the constant source pixel into all four lanes once. */
  s = set1_argb_sse2(*src);

  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    /* IN: scale the constant source by this pixel's mask */
    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    /* OVER: composite the masked color onto dest */
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  /* Vector body: four pixels per iteration. */
  for (; n >= 4; n -= 4) {
    __m128i d, color, m;
    m = load_u8_mask(mask);
    color = muldiv_255_sse2(s, m);  /* IN */
    d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));  /* OVER */
    store_argb_sse2(dest, d);
    mask += 4;
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.460 +
/* IN then OVER with one constant mask value:
 * dest = (src[i] * *mask / 255) OVER dest. */
SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  /* Broadcast the constant mask byte into every byte lane once. */
  m = set1_u8_mask(*mask);

  /* Scalar head: advance until dest is 16-byte aligned. */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    /* IN: scale the source pixel by the constant mask */
    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    /* OVER: composite the masked color onto dest */
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
  /* Vector body: four pixels per iteration. */
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    s = muldiv_255_sse2(s, m);  /* IN */
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));  /* OVER */
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  /* Scalar tail: the final 0-3 pixels. */
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    d = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
1.515 +
/* OVER for 8-bit single-channel data: dest = dest + src - dest*src/255.
 * Since the "alpha" of a u8 value is the value itself, 16 bytes at a time
 * can be pushed through the ARGB OVER path with srca = s. */
SSE_FUNCTION static void
composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
  /* over_u8 can be dealt with using our argb code, with srca = s */
  for (; n >= 16; n -= 16) {
    __m128i d, s;
    d = *(__m128i *)dest;
    s = load_argb_sse2((uint32_t *)src);
    store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
    src += 16;
    dest += 16;
  }
  /* Scalar tail: the final 0-15 bytes. */
  for (; n > 0; n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,
    OIL_IMPL_FLAG_SSE2);
1.542 +
1.543 +#endif
1.544 +