/*
 * Copyright (c) Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"

#define SSE_FUNCTION __attribute__((force_align_arg_pointer))

#ifdef ENABLE_BROKEN_IMPLS

union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

static const struct _SSEData {
  union m128_int sse_16xff;
  union m128_int sse_8x0080;
} c = {
  .sse_16xff.ull = {0xffffffffffffffffULL, 0xffffffffffffffffULL},
  .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

#define MC(x) (c.sse_##x.m128)

/* non-SSE2 compositing support */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
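
/* E.g., compositing s = 0x80 over d = 0x40 with source alpha m = 0x80:
 * COMPOSITE_OVER(0x40, 0x80, 0x80)
 *   = 0x40 + 0x80 - oil_muldiv_255(0x40, 0x80)
 *   = 0x40 + 0x80 - 0x20 = 0xa0,
 * where oil_muldiv_255(a,b) computes a*b/255 with rounding. */
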
/* This SSE2 code is based around operations on four pixels at a time.  The
 * exception is muldiv_255_sse2, which needs to expand the four pixels into
 * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
 * the appropriate rounding on division.
 */

/* Shuffles the given value such that the alpha for each pixel appears in each
 * channel of the pixel.
 */
SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)
{
#if 0
  /* Shift the alpha channel of each pixel into the low byte */
  a = _mm_srli_epi32(a, 24);
  /* Now, shift and or so we can get it into all the channels */
  a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
  a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
  return a;
#else
  /* Move the alpha channel into the low byte */
  a = _mm_srli_epi32(a, 24);
  /* Pack our four alpha channels down into the lower 32 bits */
  a = _mm_packus_epi16(a, _mm_setzero_si128());
  a = _mm_packus_epi16(a, _mm_setzero_si128());
  /* And expand it back out into four pixels of all channels the same */
  a = _mm_unpacklo_epi8(a, a);
  return _mm_unpacklo_epi16(a, a);
#endif
}
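
/* For instance, argb_A_sse2 maps the pixels {0xaa223344, 0xbb223344,
 * 0xcc223344, 0xdd223344} to {0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc,
 * 0xdddddddd}. */
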
/* Multiplies the unpacked 16-bits-per-channel pixel data in a,
 * channel by channel, by b, and divides the result by 255, with rounding.
 */
SSE_FUNCTION static inline __m128i
inner_muldiv_255_sse2(__m128i a, __m128i b)
{
  __m128i ret;
  __m128i roundconst = MC(8x0080);

  ret = _mm_mullo_epi16(a, b);
  ret = _mm_adds_epu16(ret, roundconst);
  ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
  ret = _mm_srli_epi16(ret, 8);

  return ret;
}
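
/* The add/shift pair in inner_muldiv_255_sse2 rounds without a division:
 * for t = a*b with a, b in [0,255], ((t + 0x80) + ((t + 0x80) >> 8)) >> 8
 * equals t/255 rounded to nearest.  The worst case,
 * 0xff*0xff + 0x80 + 0xfe = 0xff7f, stays below 2^16, so the saturating
 * adds never clip. */
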
SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a, __m128i b)
{
  __m128i alow, blow, ahigh, bhigh, low, high;

  alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
  blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
  ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
  bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
  low = inner_muldiv_255_sse2(alow, blow);
  high = inner_muldiv_255_sse2(ahigh, bhigh);
  return _mm_packus_epi16(low, high);
}
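
/* Computes 255 - x in every byte (1's complement); used to form the
 * (255 - alpha) factor for OVER. */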
SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)
{
  return _mm_xor_si128(a, MC(16xff));
}
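
/* Loads four ARGB pixels from a potentially unaligned pointer. */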
SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t *src)
{
  return _mm_loadu_si128((__m128i *)src);
}
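
/* Broadcasts a single ARGB pixel into all four lanes. */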
SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)
{
  return _mm_set1_epi32(src);
}
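
/* Loads four 8-bit mask values and expands each to fill all four channels
 * of its corresponding pixel. */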
SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t *m)
{
  __m128i a;

  a = _mm_cvtsi32_si128(*(uint32_t *)m);
  a = _mm_unpacklo_epi8(a, a);
  a = _mm_unpacklo_epi16(a, a);
  return a;
}
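
/* Broadcasts a single 8-bit mask value to every byte of the register. */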
SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)
{
  return _mm_set1_epi8(m);
}
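
/* Stores four ARGB pixels; dest must be 16-byte aligned. */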
SSE_FUNCTION static void
store_argb_sse2(uint32_t *dest, __m128i pix)
{
  _mm_store_si128((__m128i *)dest, pix);
}
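
/* Composites four src pixels over four dest pixels:
 * src + dest * (255 - srca) / 255, with unsigned saturation.  srca is the
 * source alpha replicated across each pixel's channels. */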
SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
{
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
}
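
/* IN compositing: dest = src * mask / 255.  Each function below follows
 * the same pattern: a scalar head loop runs until dest is 16-byte aligned,
 * an SSE2 loop then processes four pixels per iteration, and a scalar tail
 * finishes the remainder. */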
SSE_FUNCTION static void
composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
    int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t s = *src++;
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
  }
  for (; n >= 4; n -= 4) {
    __m128i s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 4;
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t s = *src++;
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
    OIL_IMPL_FLAG_SSE2);
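
/* IN with a constant source pixel, broadcast into an SSE register once,
 * outside the loops. */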
SSE_FUNCTION static void
composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
  for (; n >= 4; n -= 4) {
    __m128i m;
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
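
/* IN with a constant 8-bit mask, broadcast once outside the loops. */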
SSE_FUNCTION static void
composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  m = set1_u8_mask(*mask);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
  for (; n >= 4; n -= 4) {
    __m128i s;
    s = load_argb_sse2(src);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
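
/* OVER compositing: dest = src + dest * (255 - source alpha) / 255. */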
SSE_FUNCTION static void
composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
    OIL_IMPL_FLAG_SSE2);
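
/* OVER with a constant source: the source pixel and its negated,
 * replicated alpha are both hoisted out of the loops. */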
SSE_FUNCTION static void
composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
{
  __m128i s, sa;
  uint8_t srca;

  srca = oil_argb_A(*src);
  s = set1_argb_sse2(*src);
  sa = negate_argb_sse2(argb_A_sse2(s));
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest;
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
  }
  for (; n >= 4; n -= 4) {
    __m128i d;
    d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
    store_argb_sse2(dest, d);
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest;
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
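
/* IN followed by OVER: the source is first multiplied by the mask, then
 * the result (with its recomputed alpha) is composited over dest. */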
SSE_FUNCTION static void
composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
    OIL_IMPL_FLAG_SSE2);
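
/* IN-over with a constant source pixel. */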
SSE_FUNCTION static void
composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
  for (; n >= 4; n -= 4) {
    __m128i d, m, color;
    m = load_u8_mask(mask);
    color = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));
    store_argb_sse2(dest, d);
    mask += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
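
/* IN-over with a constant mask. */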
SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  m = set1_u8_mask(*mask);

  for (; ((long)dest & 15) && (n > 0); n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
  for (; n >= 4; n -= 4) {
    __m128i d, s;
    s = load_argb_sse2(src);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 4;
    dest += 4;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    *dest++ = oil_argb(
        COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
        COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
        COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
        COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
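
/* OVER for 8-bit (alpha-only) surfaces; since each byte is its own alpha,
 * srca = s and sixteen values are processed per iteration. */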
513 SSE_FUNCTION static void
514 composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
516 /* Initial operations to align the destination pointer */
517 for (; ((long)dest & 15) && (n > 0); n--) {
518 *dest = COMPOSITE_OVER(*dest, *src, *src);
522 /* over_u8 can be dealt with using our argb code, with srca = s */
523 for (; n >= 16; n -= 16) {
525 d = *(__m128i *)dest;
526 s = load_argb_sse2((uint32_t *)src);
527 store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
532 *dest = COMPOSITE_OVER(*dest, *src, *src);
537 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,