/*
 * Copyright (c) 2005
 * Eric Anholt. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
sl@0: */ sl@0: sl@0: #ifdef HAVE_CONFIG_H sl@0: #include "config.h" sl@0: #endif sl@0: #include sl@0: #include sl@0: #include sl@0: #include "liboil/liboilcolorspace.h" sl@0: sl@0: #define SSE_FUNCTION __attribute__((force_align_arg_pointer)) sl@0: sl@0: #ifdef ENABLE_BROKEN_IMPLS sl@0: sl@0: union m128_int { sl@0: __m128i m128; sl@0: uint64_t ull[2]; sl@0: }; sl@0: sl@0: static const struct _SSEData { sl@0: union m128_int sse_16xff; sl@0: union m128_int sse_8x0080; sl@0: } c = { sl@0: .sse_16xff.ull = {0xffffffffffffffffULL, 0xffffffffffffffffULL}, sl@0: .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, sl@0: }; sl@0: sl@0: #define MC(x) (c.sse_##x.m128) sl@0: sl@0: /* non-SSE2 compositing support */ sl@0: #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) sl@0: #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) sl@0: #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m)) sl@0: sl@0: /* This SSE2 code is based around operations on four pixels at a time. The sl@0: * exception is muldiv_255_sse2, which needs to expand the four pixels into sl@0: * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing sl@0: * the appropriate rounding on division. sl@0: */ sl@0: sl@0: /* Shuffles the given value such that the alpha for each pixel appears in each sl@0: * channel of the pixel. 
sl@0: */ sl@0: SSE_FUNCTION static inline __m128i sl@0: argb_A_sse2(__m128i a) sl@0: { sl@0: #if 0 sl@0: /* Shift the alpha channel of each pixel into the low byte */ sl@0: a = _mm_srli_epi32(a, 24); sl@0: /* Now, shift and or so we can get it into all the channels */ sl@0: a = _mm_or_si128(a, _mm_slli_epi32(a, 8)); sl@0: a = _mm_or_si128(a, _mm_slli_epi32(a, 16)); sl@0: return a; sl@0: #else sl@0: /* Move the alpha channel into the low byte */ sl@0: a = _mm_srli_epi32(a, 24); sl@0: /* Pack our four alpha channels down into the lower 32 bits */ sl@0: a = _mm_packus_epi16(a, _mm_setzero_si128()); sl@0: a = _mm_packus_epi16(a, _mm_setzero_si128()); sl@0: /* And expand it back out into four pixels of all channels the same */ sl@0: a = _mm_unpacklo_epi8(a, a); sl@0: return _mm_unpacklo_epi16(a, a); sl@0: #endif sl@0: } sl@0: sl@0: /* Multiplies the unpacked 16-bits-per-channel pixel data in a sl@0: * channel-by-channel by b, and divides the result by 255, with rounding. sl@0: */ sl@0: SSE_FUNCTION static inline __m128i sl@0: inner_muldiv_255_sse2(__m128i a, __m128i b) sl@0: { sl@0: __m128i ret; sl@0: __m128i roundconst = MC(8x0080); sl@0: sl@0: ret = _mm_mullo_epi16(a, b); sl@0: ret = _mm_adds_epu16(ret, roundconst); sl@0: ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8)); sl@0: ret = _mm_srli_epi16(ret, 8); sl@0: sl@0: return ret; sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: muldiv_255_sse2(__m128i a, __m128i b) sl@0: { sl@0: __m128i alow, blow, ahigh, bhigh, low, high; sl@0: sl@0: alow = _mm_unpacklo_epi8(a, _mm_setzero_si128()); sl@0: blow = _mm_unpacklo_epi8(b, _mm_setzero_si128()); sl@0: ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128()); sl@0: bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128()); sl@0: low = inner_muldiv_255_sse2(alow, blow); sl@0: high = inner_muldiv_255_sse2(ahigh, bhigh); sl@0: return _mm_packus_epi16(low, high); sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: negate_argb_sse2(__m128i a) sl@0: { sl@0: return 
_mm_xor_si128(a, MC(16xff)); sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: load_argb_sse2(const uint32_t *src) sl@0: { sl@0: return _mm_loadu_si128((__m128i *)src); sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: set1_argb_sse2(uint32_t src) sl@0: { sl@0: return _mm_set1_epi32(src); sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: load_u8_mask(const uint8_t *m) sl@0: { sl@0: __m128i a; sl@0: a = _mm_cvtsi32_si128(*(uint32_t *)m); sl@0: a = _mm_unpacklo_epi8(a, a); sl@0: a = _mm_unpacklo_epi16(a, a); sl@0: return a; sl@0: } sl@0: sl@0: SSE_FUNCTION static inline __m128i sl@0: set1_u8_mask(uint8_t m) sl@0: { sl@0: return _mm_set1_epi8(m); sl@0: } sl@0: sl@0: SSE_FUNCTION static void sl@0: store_argb_sse2(uint32_t *dest, __m128i pix) sl@0: { sl@0: _mm_store_si128((__m128i *)dest, pix); sl@0: } sl@0: sl@0: SSE_FUNCTION static __m128i sl@0: over_argb_sse2(__m128i dest, __m128i src, __m128i srca) sl@0: { sl@0: return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca))); sl@0: } sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, sl@0: int n) sl@0: { sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t s = *src++; sl@0: uint8_t m = *mask++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), m), sl@0: COMPOSITE_IN(oil_argb_R(s), m), sl@0: COMPOSITE_IN(oil_argb_G(s), m), sl@0: COMPOSITE_IN(oil_argb_B(s), m)); sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i s, m; sl@0: s = load_argb_sse2(src); sl@0: m = load_u8_mask(mask); sl@0: store_argb_sse2(dest, muldiv_255_sse2(s, m)); sl@0: src += 4; sl@0: mask += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t s = *src++; sl@0: uint8_t m = *mask++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), m), sl@0: COMPOSITE_IN(oil_argb_R(s), m), sl@0: COMPOSITE_IN(oil_argb_G(s), m), sl@0: COMPOSITE_IN(oil_argb_B(s), m)); sl@0: } sl@0: } sl@0: 
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb, sl@0: OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src, sl@0: const uint8_t *mask, int n) sl@0: { sl@0: __m128i s; sl@0: sl@0: s = set1_argb_sse2(*src); sl@0: sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint8_t m = *mask++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(*src), m), sl@0: COMPOSITE_IN(oil_argb_R(*src), m), sl@0: COMPOSITE_IN(oil_argb_G(*src), m), sl@0: COMPOSITE_IN(oil_argb_B(*src), m)); sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i m; sl@0: m = load_u8_mask(mask); sl@0: store_argb_sse2(dest, muldiv_255_sse2(s, m)); sl@0: mask += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint8_t m = *mask++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(*src), m), sl@0: COMPOSITE_IN(oil_argb_R(*src), m), sl@0: COMPOSITE_IN(oil_argb_G(*src), m), sl@0: COMPOSITE_IN(oil_argb_B(*src), m)); sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse, sl@0: composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, sl@0: const uint8_t *mask, int n) sl@0: { sl@0: __m128i m; sl@0: sl@0: m = set1_u8_mask(*mask); sl@0: sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t s = *src++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), mask[0]), sl@0: COMPOSITE_IN(oil_argb_R(s), mask[0]), sl@0: COMPOSITE_IN(oil_argb_G(s), mask[0]), sl@0: COMPOSITE_IN(oil_argb_B(s), mask[0])); sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i s; sl@0: s = load_argb_sse2(src); sl@0: store_argb_sse2(dest, muldiv_255_sse2(s, m)); sl@0: src += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t s = *src++; sl@0: sl@0: *dest++ = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), mask[0]), sl@0: 
COMPOSITE_IN(oil_argb_R(s), mask[0]), sl@0: COMPOSITE_IN(oil_argb_G(s), mask[0]), sl@0: COMPOSITE_IN(oil_argb_B(s), mask[0])); sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse, sl@0: composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n) sl@0: { sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t d = *dest, s = *src++; sl@0: uint8_t srca = oil_argb_A(s); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca)); sl@0: *dest++ = d; sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i d, s; sl@0: s = load_argb_sse2(src); sl@0: d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s)); sl@0: store_argb_sse2(dest, d); sl@0: src += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t d = *dest, s = *src++; sl@0: uint8_t srca = oil_argb_A(s); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca)); sl@0: *dest++ = d; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb, sl@0: OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n) sl@0: { sl@0: __m128i s, sa; sl@0: uint32_t srca; sl@0: sl@0: srca = oil_argb_A(*src); sl@0: s = set1_argb_sse2(*src); sl@0: sa = negate_argb_sse2(argb_A_sse2(s)); sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t d = *dest; sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), 
oil_argb_R(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca)); sl@0: *dest++ = d; sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i d; sl@0: d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa)); sl@0: store_argb_sse2(dest, d); sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t d = *dest; sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca)); sl@0: *dest++ = d; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse, sl@0: composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src, sl@0: const uint8_t *mask, int n) sl@0: { sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t d = *dest, s = *src++, m = *mask++, color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), m), sl@0: COMPOSITE_IN(oil_argb_R(s), m), sl@0: COMPOSITE_IN(oil_argb_G(s), m), sl@0: COMPOSITE_IN(oil_argb_B(s), m)); sl@0: srca = oil_argb_A(color); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i d, s, m; sl@0: s = load_argb_sse2(src); sl@0: m = load_u8_mask(mask); sl@0: s = muldiv_255_sse2(s, m); sl@0: d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s)); sl@0: store_argb_sse2(dest, d); sl@0: src += 4; sl@0: mask += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t d = *dest, s = *src++, m = *mask++, 
color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), m), sl@0: COMPOSITE_IN(oil_argb_R(s), m), sl@0: COMPOSITE_IN(oil_argb_G(s), m), sl@0: COMPOSITE_IN(oil_argb_B(s), m)); sl@0: srca = oil_argb_A(color); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb, sl@0: OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, sl@0: const uint8_t *mask, int n) sl@0: { sl@0: __m128i s; sl@0: sl@0: s = set1_argb_sse2(*src); sl@0: sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t d = *dest, m = *mask++, color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(*src), m), sl@0: COMPOSITE_IN(oil_argb_R(*src), m), sl@0: COMPOSITE_IN(oil_argb_G(*src), m), sl@0: COMPOSITE_IN(oil_argb_B(*src), m)); sl@0: srca = oil_argb_A(color); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i d, color, m; sl@0: m = load_u8_mask(mask); sl@0: color = muldiv_255_sse2(s, m); sl@0: d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color)); sl@0: store_argb_sse2(dest, d); sl@0: mask += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t d = *dest, m = *mask++, color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(*src), m), sl@0: COMPOSITE_IN(oil_argb_R(*src), m), 
sl@0: COMPOSITE_IN(oil_argb_G(*src), m), sl@0: COMPOSITE_IN(oil_argb_B(*src), m)); sl@0: srca = oil_argb_A(color); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse, sl@0: composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, sl@0: const uint8_t *mask, int n) sl@0: { sl@0: __m128i m; sl@0: sl@0: m = set1_u8_mask(*mask); sl@0: sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: uint32_t d = *dest, s = *src++, color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), *mask), sl@0: COMPOSITE_IN(oil_argb_R(s), *mask), sl@0: COMPOSITE_IN(oil_argb_G(s), *mask), sl@0: COMPOSITE_IN(oil_argb_B(s), *mask)); sl@0: srca = oil_argb_A(color); sl@0: d = oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: for (; n >= 4; n -= 4) { sl@0: __m128i d, s; sl@0: s = load_argb_sse2(src); sl@0: s = muldiv_255_sse2(s, m); sl@0: d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s)); sl@0: store_argb_sse2(dest, d); sl@0: src += 4; sl@0: dest += 4; sl@0: } sl@0: for (; n > 0; n--) { sl@0: uint32_t d = *dest, s = *src++, color; sl@0: uint8_t srca; sl@0: sl@0: color = oil_argb( sl@0: COMPOSITE_IN(oil_argb_A(s), *mask), sl@0: COMPOSITE_IN(oil_argb_R(s), *mask), sl@0: COMPOSITE_IN(oil_argb_G(s), *mask), sl@0: COMPOSITE_IN(oil_argb_B(s), *mask)); sl@0: srca = oil_argb_A(color); sl@0: d 
= oil_argb( sl@0: COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), sl@0: COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), sl@0: COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), sl@0: COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); sl@0: *dest++ = d; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse, sl@0: composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); sl@0: sl@0: SSE_FUNCTION static void sl@0: composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n) sl@0: { sl@0: /* Initial operations to align the destination pointer */ sl@0: for (; ((long)dest & 15) && (n > 0); n--) { sl@0: *dest = COMPOSITE_OVER(*dest, *src, *src); sl@0: src++; sl@0: dest++; sl@0: } sl@0: /* over_u8 can be dealt with using our argb code, with srca = s */ sl@0: for (; n >= 16; n -= 16) { sl@0: __m128i d, s; sl@0: d = *(__m128i *)dest; sl@0: s = load_argb_sse2((uint32_t *)src); sl@0: store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s)); sl@0: src += 16; sl@0: dest += 16; sl@0: } sl@0: for (; n > 0; n--) { sl@0: *dest = COMPOSITE_OVER(*dest, *src, *src); sl@0: src++; sl@0: dest++; sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8, sl@0: OIL_IMPL_FLAG_SSE2); sl@0: sl@0: #endif sl@0: