1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/composite_sse_2pix.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,427 @@
1.4 +/*
1.5 + * Copyright (c) 2005
1.6 + * Eric Anholt. All rights reserved.
1.7 + *
1.8 + * Redistribution and use in source and binary forms, with or without
1.9 + * modification, are permitted provided that the following conditions
1.10 + * are met:
1.11 + * 1. Redistributions of source code must retain the above copyright
1.12 + * notice, this list of conditions and the following disclaimer.
1.13 + * 2. Redistributions in binary form must reproduce the above copyright
1.14 + * notice, this list of conditions and the following disclaimer in the
1.15 + * documentation and/or other materials provided with the distribution.
1.16 + *
1.17 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
1.18 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1.19 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.20 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
1.21 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1.22 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1.23 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.24 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1.25 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1.26 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1.27 + * SUCH DAMAGE.
1.28 + */
1.29 +
1.30 +#ifdef HAVE_CONFIG_H
1.31 +#include "config.h"
1.32 +#endif
1.33 +#include <liboilclasses.h>
1.34 +#include <liboilfunction.h>
1.35 +#include <emmintrin.h>
1.36 +#include "liboil/liboilcolorspace.h"
1.37 +
1.38 +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) /* GCC x86 attribute: realign the stack in the function prologue if necessary */
1.39 +
1.40 +/* Scalar (non-SSE2) per-channel compositing helpers; the loops below use them for leftover pixels */
1.41 +#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) /* d + s - muldiv(d,m); not clamped here */
1.42 +#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) /* presumably a sum clamped to 255 -- confirm against oil_clamp_255 */
1.43 +#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m)) /* presumably s*m/255 -- confirm against oil_muldiv_255 */
1.44 +
1.45 +/* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
1.46 + * the channel value in the low byte. This means 2 pixels per pass.
1.47 + */
1.48 +
1.49 +#ifdef ENABLE_BROKEN_IMPLS
1.50 +
1.51 +union m128_int { /* overlays an SSE register value with two 64-bit integers so it can be statically initialised */
1.52 + __m128i m128; /* view used by the intrinsics (see MC()) */
1.53 + uint64_t ull[2]; /* view used by the static initialiser */
1.54 +};
1.55 +
1.56 +static const struct _SSEData { /* constant vectors shared by the helpers below */
1.57 + union m128_int sse_8x00ff; /* eight 16-bit lanes of 0x00ff */
1.58 + union m128_int sse_8x0080; /* eight 16-bit lanes of 0x0080 */
1.59 +} c = {
1.60 + .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL}, /* XOR mask used by negate_argb_sse2 */
1.61 + .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL}, /* rounding term used by muldiv_255_sse2 */
1.62 +};
1.63 +
1.64 +#define MC(x) (c.sse_##x.m128) /* MC(8x0080) expands to c.sse_8x0080.m128, the __m128i view of that constant */
1.65 +
1.66 +/* Broadcasts the alpha of each unpacked pixel into all four 16-bit lanes of
1.67 + * that pixel. The register holds two pixels: one in the low 64 bits and one in
1.68 + * the high 64 bits; lane 3 of each half is the one replicated. */
1.69 +SSE_FUNCTION static inline __m128i
1.70 +argb_A_sse2(__m128i a)
1.71 +{
1.72 + a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3)); /* high half: every lane <- its lane 3 */
1.73 + a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3)); /* low half: every lane <- its lane 3 */
1.74 + return a;
1.75 +}
1.76 +
1.77 +/* Multiplies a by b in each 16-bit lane and divides the product by 255 with
1.78 + * rounding, computed as (t + (t >> 8)) >> 8 where t = a*b + 0x80. Only the low
1.79 + * 16 bits of each product are kept, so lanes are expected to hold 0..255. */
1.80 +SSE_FUNCTION static inline __m128i
1.81 +muldiv_255_sse2(__m128i a, __m128i b)
1.82 +{
1.83 + __m128i ret;
1.84 + __m128i roundconst = MC(8x0080); /* 0x0080 in every lane */
1.85 +
1.86 + ret = _mm_mullo_epi16(a, b); /* low 16 bits of each lane product */
1.87 + ret = _mm_adds_epu16(ret, roundconst); /* t = a*b + 0x80 (unsigned saturating add) */
1.88 + ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8)); /* t + (t >> 8) */
1.89 + ret = _mm_srli_epi16(ret, 8); /* final >> 8 leaves the quotient in the low byte */
1.90 +
1.91 + return ret;
1.92 +}
1.93 +
1.94 +SSE_FUNCTION static inline __m128i /* returns a with the low byte of every 16-bit lane inverted */
1.95 +negate_argb_sse2(__m128i a)
1.96 +{
1.97 + return _mm_xor_si128(a, MC(8x00ff)); /* v ^ 0x00ff == 255 - v for lane values in 0..255 */
1.98 +}
1.99 +
1.100 +/* Loads the 2 (unaligned) pixels (8 bytes) at *src and widens each byte to a 16-bit lane */
1.101 +SSE_FUNCTION static inline __m128i
1.102 +load_argb_sse2(const uint32_t *src)
1.103 +{
1.104 + __m128i pix;
1.105 +
1.106 + pix = _mm_loadl_epi64((__m128i *)src); /* 64-bit load into the low half; NOTE(review): the cast drops const */
1.107 + pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128()); /* interleave with zero bytes: each byte becomes a 16-bit lane */
1.108 + return pix;
1.109 +}
1.110 +
1.111 +SSE_FUNCTION static inline __m128i /* returns two unpacked copies of the single pixel src */
1.112 +set1_argb_sse2(uint32_t src)
1.113 +{
1.114 + __m128i pix;
1.115 +
1.116 + pix = _mm_set1_epi32(src); /* replicate the 32-bit pixel into all four 32-bit lanes */
1.117 + pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128()); /* widen the low 8 bytes (two copies) to 16-bit lanes */
1.118 + return pix;
1.119 +}
1.120 +
1.121 +SSE_FUNCTION static inline __m128i /* reads m[0] and m[1] and returns one mask value per unpacked pixel */
1.122 +load_u8_mask(const uint8_t *m)
1.123 +{
1.124 + return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1])); /* lanes 0-3 = m[0], lanes 4-7 = m[1] */
1.125 +}
1.126 +
1.127 +SSE_FUNCTION static inline __m128i /* returns the single mask value m in all eight 16-bit lanes */
1.128 +set1_u8_mask(uint8_t m)
1.129 +{
1.130 + return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128()); /* m in every byte, then widened with zero high bytes */
1.131 +}
1.132 +
1.133 +/* Packs the 2 unpacked pixels in pix back to bytes and stores the 8 bytes to the (unaligned) *dest */
1.134 +SSE_FUNCTION static void
1.135 +store_argb_sse2(uint32_t *dest, __m128i pix)
1.136 +{
1.137 + pix = _mm_packus_epi16(pix, pix); /* 16-bit lanes -> bytes with unsigned saturation; low 64 bits hold both pixels */
1.138 + _mm_storel_epi64((__m128i *)dest, pix); /* writes only the low 64 bits */
1.139 +}
1.140 +
1.141 +SSE_FUNCTION static __m128i /* OVER on unpacked pixels: src + dest * (255 - srca) / 255 per lane */
1.142 +over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
1.143 +{
1.144 + return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca))); /* byte-wise saturating add caps each channel at 255 */
1.145 +}
1.146 +
1.147 +SSE_FUNCTION static void /* for n pixels: dest[i] = src[i] with every channel scaled by mask[i]/255 */
1.148 +composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
1.149 + const uint8_t *mask, int n)
1.150 +{
1.151 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.152 + __m128i s, m;
1.153 + s = load_argb_sse2(src);
1.154 + m = load_u8_mask(mask); /* one mask byte per pixel, spread over that pixel's lanes */
1.155 + store_argb_sse2(dest, muldiv_255_sse2(s, m));
1.156 + src += 2;
1.157 + mask += 2;
1.158 + dest += 2;
1.159 + }
1.160 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left */
1.161 + uint32_t s = *src++;
1.162 + uint8_t m = *mask++;
1.163 +
1.164 + *dest++ = oil_argb(
1.165 + COMPOSITE_IN(oil_argb_A(s), m),
1.166 + COMPOSITE_IN(oil_argb_R(s), m),
1.167 + COMPOSITE_IN(oil_argb_G(s), m),
1.168 + COMPOSITE_IN(oil_argb_B(s), m));
1.169 + }
1.170 +}
1.171 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
1.172 + OIL_IMPL_FLAG_SSE2); /* liboil macro; presumably registers this as an SSE2 implementation of composite_in_argb -- confirm */
1.173 +
1.174 +SSE_FUNCTION static void /* for n pixels: dest[i] = the single pixel *src with every channel scaled by mask[i]/255 */
1.175 +composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
1.176 + const uint8_t *mask, int n)
1.177 +{
1.178 + __m128i s;
1.179 +
1.180 + s = set1_argb_sse2(*src); /* source is constant: unpack it once, outside the loop */
1.181 +
1.182 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.183 + __m128i m;
1.184 + m = load_u8_mask(mask);
1.185 + store_argb_sse2(dest, muldiv_255_sse2(s, m));
1.186 + mask += 2;
1.187 + dest += 2;
1.188 + }
1.189 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left; src is never advanced */
1.190 + uint8_t m = *mask++;
1.191 +
1.192 + *dest++ = oil_argb(
1.193 + COMPOSITE_IN(oil_argb_A(*src), m),
1.194 + COMPOSITE_IN(oil_argb_R(*src), m),
1.195 + COMPOSITE_IN(oil_argb_G(*src), m),
1.196 + COMPOSITE_IN(oil_argb_B(*src), m));
1.197 + }
1.198 +}
1.199 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
1.200 + composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.201 +
1.202 +#ifdef SSE_ALIGN
1.203 +SSE_FUNCTION static void /* for n pixels: dest[i] = src[i] with every channel scaled by the single value mask[0]/255; only built when SSE_ALIGN is defined */
1.204 +composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
1.205 + const uint8_t *mask, int n)
1.206 +{
1.207 + __m128i m;
1.208 +
1.209 + m = set1_u8_mask(*mask); /* mask is constant: broadcast it once, outside the loop */
1.210 +
1.211 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.212 + __m128i s;
1.213 + s = load_argb_sse2(src);
1.214 + store_argb_sse2(dest, muldiv_255_sse2(s, m));
1.215 + src += 2;
1.216 + dest += 2;
1.217 + }
1.218 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left */
1.219 + uint32_t s = *src++;
1.220 +
1.221 + *dest++ = oil_argb(
1.222 + COMPOSITE_IN(oil_argb_A(s), mask[0]),
1.223 + COMPOSITE_IN(oil_argb_R(s), mask[0]),
1.224 + COMPOSITE_IN(oil_argb_G(s), mask[0]),
1.225 + COMPOSITE_IN(oil_argb_B(s), mask[0]));
1.226 + }
1.227 +}
1.228 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
1.229 + composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
1.230 +#endif
1.231 +
1.232 +SSE_FUNCTION static void /* for n pixels: composites src[i] OVER dest[i] in place, using src[i]'s own alpha */
1.233 +composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
1.234 +{
1.235 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.236 + __m128i d, s;
1.237 + s = load_argb_sse2(src);
1.238 + d = load_argb_sse2(dest);
1.239 + d = over_argb_sse2(d, s, argb_A_sse2(s)); /* d = s + d * (255 - alpha(s)) / 255 */
1.240 + store_argb_sse2(dest, d);
1.241 + src += 2;
1.242 + dest += 2;
1.243 + }
1.244 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left */
1.245 + uint32_t d = *dest, s = *src++;
1.246 + uint8_t srca = oil_argb_A(s); /* source alpha drives all four channels */
1.247 + d = oil_argb(
1.248 + COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
1.249 + COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
1.250 + COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
1.251 + COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
1.252 + *dest++ = d;
1.253 + }
1.254 +}
1.255 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
1.256 + OIL_IMPL_FLAG_SSE2);
1.257 +
1.258 +SSE_FUNCTION static void /* for n pixels: composites the single pixel *src OVER dest[i] in place */
1.259 +composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
1.260 + int n)
1.261 +{
1.262 + __m128i s, sa;
1.263 + uint32_t srca;
1.264 +
1.265 + srca = oil_argb_A(*src); /* scalar alpha, used only by the tail loop */
1.266 + s = set1_argb_sse2(*src); /* source is constant: unpack it once */
1.267 + sa = negate_argb_sse2(argb_A_sse2(s)); /* 255 - alpha in every lane, hoisted out of the loop */
1.268 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.269 + __m128i d;
1.270 + d = load_argb_sse2(dest);
1.271 + d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa)); /* same arithmetic as over_argb_sse2, with the negate precomputed */
1.272 + store_argb_sse2(dest, d);
1.273 + dest += 2;
1.274 + }
1.275 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left */
1.276 + uint32_t d = *dest;
1.277 + d = oil_argb(
1.278 + COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
1.279 + COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
1.280 + COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
1.281 + COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
1.282 + *dest++ = d;
1.283 + }
1.284 +}
1.285 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
1.286 + composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.287 +
1.288 +SSE_FUNCTION static void /* for n pixels: scales src[i] by mask[i]/255 (IN), then composites the result OVER dest[i] in place */
1.289 +composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
1.290 + const uint8_t *mask, int n)
1.291 +{
1.292 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.293 + __m128i d, s, m;
1.294 + s = load_argb_sse2(src);
1.295 + m = load_u8_mask(mask);
1.296 + d = load_argb_sse2(dest);
1.297 + s = muldiv_255_sse2(s, m); /* IN step: the masked source replaces s */
1.298 + d = over_argb_sse2(d, s, argb_A_sse2(s)); /* OVER step uses the alpha of the masked source */
1.299 + store_argb_sse2(dest, d);
1.300 + src += 2;
1.301 + mask += 2;
1.302 + dest += 2;
1.303 + }
1.304 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left */
1.305 + uint32_t d = *dest, s = *src++, m = *mask++, color;
1.306 + uint8_t srca;
1.307 +
1.308 + color = oil_argb( /* IN step */
1.309 + COMPOSITE_IN(oil_argb_A(s), m),
1.310 + COMPOSITE_IN(oil_argb_R(s), m),
1.311 + COMPOSITE_IN(oil_argb_G(s), m),
1.312 + COMPOSITE_IN(oil_argb_B(s), m));
1.313 + srca = oil_argb_A(color); /* alpha of the masked source */
1.314 + d = oil_argb( /* OVER step */
1.315 + COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
1.316 + COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
1.317 + COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
1.318 + COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
1.319 + *dest++ = d;
1.320 + }
1.321 +}
1.322 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
1.323 + OIL_IMPL_FLAG_SSE2);
1.324 +
1.325 +SSE_FUNCTION static void /* for n pixels: scales the single pixel *src by mask[i]/255 (IN), then composites the result OVER dest[i] in place */
1.326 +composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
1.327 + const uint8_t *mask, int n)
1.328 +{
1.329 + __m128i s;
1.330 +
1.331 + s = set1_argb_sse2(*src); /* source is constant: unpack it once, outside the loop */
1.332 +
1.333 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.334 + __m128i d, color, m;
1.335 + m = load_u8_mask(mask);
1.336 + d = load_argb_sse2(dest);
1.337 + color = muldiv_255_sse2(s, m); /* IN step: per-pixel masked copy of the constant source */
1.338 + d = over_argb_sse2(d, color, argb_A_sse2(color)); /* OVER step uses the alpha of the masked source */
1.339 + store_argb_sse2(dest, d);
1.340 + mask += 2;
1.341 + dest += 2;
1.342 + }
1.343 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left; src is never advanced */
1.344 + uint32_t d = *dest, m = *mask++, color;
1.345 + uint8_t srca;
1.346 +
1.347 + color = oil_argb( /* IN step */
1.348 + COMPOSITE_IN(oil_argb_A(*src), m),
1.349 + COMPOSITE_IN(oil_argb_R(*src), m),
1.350 + COMPOSITE_IN(oil_argb_G(*src), m),
1.351 + COMPOSITE_IN(oil_argb_B(*src), m));
1.352 + srca = oil_argb_A(color); /* alpha of the masked source */
1.353 + d = oil_argb( /* OVER step */
1.354 + COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
1.355 + COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
1.356 + COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
1.357 + COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
1.358 + *dest++ = d;
1.359 + }
1.360 +}
1.361 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
1.362 + composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
1.363 +
1.364 +SSE_FUNCTION static void /* for n pixels: scales src[i] by the single value *mask/255 (IN), then composites the result OVER dest[i] in place */
1.365 +composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
1.366 + const uint8_t *mask, int n)
1.367 +{
1.368 + __m128i m;
1.369 +
1.370 + m = set1_u8_mask(*mask); /* mask is constant: broadcast it once, outside the loop */
1.371 +
1.372 + for (; n >= 2; n -= 2) { /* vector path: two pixels per iteration */
1.373 + __m128i d, s;
1.374 + s = load_argb_sse2(src);
1.375 + d = load_argb_sse2(dest);
1.376 + s = muldiv_255_sse2(s, m); /* IN step: the masked source replaces s */
1.377 + d = over_argb_sse2(d, s, argb_A_sse2(s)); /* OVER step uses the alpha of the masked source */
1.378 + store_argb_sse2(dest, d);
1.379 + src += 2;
1.380 + dest += 2;
1.381 + }
1.382 + for (; n > 0; n--) { /* scalar tail: at most one pixel is left; mask is never advanced */
1.383 + uint32_t d = *dest, s = *src++, color;
1.384 + uint8_t srca;
1.385 +
1.386 + color = oil_argb( /* IN step */
1.387 + COMPOSITE_IN(oil_argb_A(s), *mask),
1.388 + COMPOSITE_IN(oil_argb_R(s), *mask),
1.389 + COMPOSITE_IN(oil_argb_G(s), *mask),
1.390 + COMPOSITE_IN(oil_argb_B(s), *mask));
1.391 + srca = oil_argb_A(color); /* alpha of the masked source */
1.392 + d = oil_argb( /* OVER step */
1.393 + COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
1.394 + COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
1.395 + COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
1.396 + COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
1.397 + *dest++ = d;
1.398 + }
1.399 +}
1.400 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
1.401 + composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
1.402 +
1.403 +SSE_FUNCTION static void /* for n bytes: composites src[i] OVER dest[i] in place, each source byte acting as its own alpha */
1.404 +composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
1.405 +{
1.406 + /* Scalar head: runs until dest is 16-byte aligned or n is exhausted. NOTE(review): the pointer is cast to long and only its low 4 bits are tested. */
1.407 + for (; ((long)dest & 15) && (n > 0); n--) {
1.408 + *dest = COMPOSITE_OVER(*dest, *src, *src);
1.409 + src++;
1.410 + dest++;
1.411 + }
1.412 + /* over_u8 can be dealt with using our argb code, with srca = s; each pass handles 8 bytes */
1.413 + for (; n >= 8; n -= 8) {
1.414 + __m128i d, s;
1.415 + d = load_argb_sse2((uint32_t *)dest); /* 8 dest bytes widened to 16-bit lanes */
1.416 + s = load_argb_sse2((uint32_t *)src); /* NOTE(review): src is not aligned by the head loop; the load helper uses a 64-bit load documented there as unaligned */
1.417 + store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s)); /* s is passed as both colour and alpha */
1.418 + src += 8;
1.419 + dest += 8;
1.420 + }
1.421 + for (; n > 0; n--) { /* scalar tail: fewer than 8 bytes are left */
1.422 + *dest = COMPOSITE_OVER(*dest, *src, *src);
1.423 + src++;
1.424 + dest++;
1.425 + }
1.426 +}
1.427 +OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
1.428 + OIL_IMPL_FLAG_SSE2);
1.429 +#endif
1.430 +