/*
 * Copyright (c) 2005
 * Eric Anholt. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
30 #include <liboilclasses.h>
31 #include <liboilfunction.h>
32 #include <emmintrin.h>
33 #include "liboil/liboilcolorspace.h"
35 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
/* non-SSE2 compositing support (scalar fallbacks for the loop tails) */
/* Porter-Duff OVER for one 8-bit channel: d + s - d*m/255, m = source alpha. */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
/* ADD for one 8-bit channel, clamped to 255. */
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
/* IN for one 8-bit channel: scale s by mask m with rounding. */
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
42 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
43 * the channel value in the low byte. This means 2 pixels per pass.
46 #ifdef ENABLE_BROKEN_IMPLS
/* 16-byte constants viewable both as an __m128i (for the intrinsics) and
 * as two 64-bit words (so they can be statically initialized). */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

static const struct _SSEData {
  union m128_int sse_8x00ff;   /* eight 16-bit lanes of 0x00ff (max channel) */
  union m128_int sse_8x0080;   /* eight 16-bit lanes of 0x0080 (rounding bias) */
} c = {
  .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
  .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

/* Fetch one of the constants above as an __m128i. */
#define MC(x) (c.sse_##x.m128)
63 /* Shuffles the given value such that the alpha for each pixel appears in each
64 * channel of the pixel.
66 SSE_FUNCTION static inline __m128i
67 argb_A_sse2(__m128i a)
69 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
70 a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
74 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
75 * result by 255, with rounding.
77 SSE_FUNCTION static inline __m128i
78 muldiv_255_sse2(__m128i a, __m128i b)
81 __m128i roundconst = MC(8x0080);
83 ret = _mm_mullo_epi16(a, b);
84 ret = _mm_adds_epu16(ret, roundconst);
85 ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
86 ret = _mm_srli_epi16(ret, 8);
91 SSE_FUNCTION static inline __m128i
92 negate_argb_sse2(__m128i a)
94 return _mm_xor_si128(a, MC(8x00ff));
97 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
98 SSE_FUNCTION static inline __m128i
99 load_argb_sse2(const uint32_t *src)
103 pix = _mm_loadl_epi64((__m128i *)src);
104 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
108 SSE_FUNCTION static inline __m128i
109 set1_argb_sse2(uint32_t src)
113 pix = _mm_set1_epi32(src);
114 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
118 SSE_FUNCTION static inline __m128i
119 load_u8_mask(const uint8_t *m)
121 return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
124 SSE_FUNCTION static inline __m128i
125 set1_u8_mask(uint8_t m)
127 return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
130 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
131 SSE_FUNCTION static void
132 store_argb_sse2(uint32_t *dest, __m128i pix)
134 pix = _mm_packus_epi16(pix, pix);
135 _mm_storel_epi64((__m128i *)dest, pix);
138 SSE_FUNCTION static __m128i
139 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
141 return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
144 SSE_FUNCTION static void
145 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
146 const uint8_t *mask, int n)
148 for (; n >= 2; n -= 2) {
150 s = load_argb_sse2(src);
151 m = load_u8_mask(mask);
152 store_argb_sse2(dest, muldiv_255_sse2(s, m));
162 COMPOSITE_IN(oil_argb_A(s), m),
163 COMPOSITE_IN(oil_argb_R(s), m),
164 COMPOSITE_IN(oil_argb_G(s), m),
165 COMPOSITE_IN(oil_argb_B(s), m));
168 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
171 SSE_FUNCTION static void
172 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
173 const uint8_t *mask, int n)
177 s = set1_argb_sse2(*src);
179 for (; n >= 2; n -= 2) {
181 m = load_u8_mask(mask);
182 store_argb_sse2(dest, muldiv_255_sse2(s, m));
190 COMPOSITE_IN(oil_argb_A(*src), m),
191 COMPOSITE_IN(oil_argb_R(*src), m),
192 COMPOSITE_IN(oil_argb_G(*src), m),
193 COMPOSITE_IN(oil_argb_B(*src), m));
196 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
197 composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
200 SSE_FUNCTION static void
201 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
202 const uint8_t *mask, int n)
206 m = set1_u8_mask(*mask);
208 for (; n >= 2; n -= 2) {
210 s = load_argb_sse2(src);
211 store_argb_sse2(dest, muldiv_255_sse2(s, m));
219 COMPOSITE_IN(oil_argb_A(s), mask[0]),
220 COMPOSITE_IN(oil_argb_R(s), mask[0]),
221 COMPOSITE_IN(oil_argb_G(s), mask[0]),
222 COMPOSITE_IN(oil_argb_B(s), mask[0]));
225 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
226 composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
229 SSE_FUNCTION static void
230 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
232 for (; n >= 2; n -= 2) {
234 s = load_argb_sse2(src);
235 d = load_argb_sse2(dest);
236 d = over_argb_sse2(d, s, argb_A_sse2(s));
237 store_argb_sse2(dest, d);
242 uint32_t d = *dest, s = *src++;
243 uint8_t srca = oil_argb_A(s);
245 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
246 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
247 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
248 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
252 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
255 SSE_FUNCTION static void
256 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
262 srca = oil_argb_A(*src);
263 s = set1_argb_sse2(*src);
264 sa = negate_argb_sse2(argb_A_sse2(s));
265 for (; n >= 2; n -= 2) {
267 d = load_argb_sse2(dest);
268 d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
269 store_argb_sse2(dest, d);
275 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
276 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
277 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
278 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
282 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
283 composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
285 SSE_FUNCTION static void
286 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
287 const uint8_t *mask, int n)
289 for (; n >= 2; n -= 2) {
291 s = load_argb_sse2(src);
292 m = load_u8_mask(mask);
293 d = load_argb_sse2(dest);
294 s = muldiv_255_sse2(s, m);
295 d = over_argb_sse2(d, s, argb_A_sse2(s));
296 store_argb_sse2(dest, d);
302 uint32_t d = *dest, s = *src++, m = *mask++, color;
306 COMPOSITE_IN(oil_argb_A(s), m),
307 COMPOSITE_IN(oil_argb_R(s), m),
308 COMPOSITE_IN(oil_argb_G(s), m),
309 COMPOSITE_IN(oil_argb_B(s), m));
310 srca = oil_argb_A(color);
312 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
313 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
314 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
315 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
319 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
322 SSE_FUNCTION static void
323 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
324 const uint8_t *mask, int n)
328 s = set1_argb_sse2(*src);
330 for (; n >= 2; n -= 2) {
332 m = load_u8_mask(mask);
333 d = load_argb_sse2(dest);
334 color = muldiv_255_sse2(s, m);
335 d = over_argb_sse2(d, color, argb_A_sse2(color));
336 store_argb_sse2(dest, d);
341 uint32_t d = *dest, m = *mask++, color;
345 COMPOSITE_IN(oil_argb_A(*src), m),
346 COMPOSITE_IN(oil_argb_R(*src), m),
347 COMPOSITE_IN(oil_argb_G(*src), m),
348 COMPOSITE_IN(oil_argb_B(*src), m));
349 srca = oil_argb_A(color);
351 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
352 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
353 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
354 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
358 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
359 composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
361 SSE_FUNCTION static void
362 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
363 const uint8_t *mask, int n)
367 m = set1_u8_mask(*mask);
369 for (; n >= 2; n -= 2) {
371 s = load_argb_sse2(src);
372 d = load_argb_sse2(dest);
373 s = muldiv_255_sse2(s, m);
374 d = over_argb_sse2(d, s, argb_A_sse2(s));
375 store_argb_sse2(dest, d);
380 uint32_t d = *dest, s = *src++, color;
384 COMPOSITE_IN(oil_argb_A(s), *mask),
385 COMPOSITE_IN(oil_argb_R(s), *mask),
386 COMPOSITE_IN(oil_argb_G(s), *mask),
387 COMPOSITE_IN(oil_argb_B(s), *mask));
388 srca = oil_argb_A(color);
390 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
391 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
392 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
393 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
397 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
398 composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
400 SSE_FUNCTION static void
401 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
403 /* Initial operations to align the destination pointer */
404 for (; ((long)dest & 15) && (n > 0); n--) {
405 *dest = COMPOSITE_OVER(*dest, *src, *src);
409 /* over_u8 can be dealt with using our argb code, with srca = s */
410 for (; n >= 8; n -= 8) {
412 d = load_argb_sse2((uint32_t *)dest);
413 s = load_argb_sse2((uint32_t *)src);
414 store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
419 *dest = COMPOSITE_OVER(*dest, *src, *src);
424 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,