os/ossrv/genericopenlibs/liboil/src/composite_sse_2pix.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2  * Copyright (c) 2005
     3  *	Eric Anholt.  All rights reserved.
     4  *
     5  * Redistribution and use in source and binary forms, with or without
     6  * modification, are permitted provided that the following conditions
     7  * are met:
     8  * 1. Redistributions of source code must retain the above copyright
     9  *    notice, this list of conditions and the following disclaimer.
    10  * 2. Redistributions in binary form must reproduce the above copyright
    11  *    notice, this list of conditions and the following disclaimer in the
    12  *    documentation and/or other materials provided with the distribution.
    13  *
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    24  * SUCH DAMAGE.
    25  */
    26 
    27 #ifdef HAVE_CONFIG_H
    28 #include "config.h"
    29 #endif
#include <stdint.h>
#include <emmintrin.h>
#include <liboilclasses.h>
#include <liboilfunction.h>
#include "liboil/liboilcolorspace.h"
    34 
    35 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
    36 
    37 /* non-SSE2 compositing support */
    38 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
    39 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
    40 #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
    41 
    42 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
    43  * the channel value in the low byte.  This means 2 pixels per pass.
    44  */
    45 
    46 #ifdef ENABLE_BROKEN_IMPLS
    47 
    48 union m128_int {
    49   __m128i m128;
    50   uint64_t ull[2];
    51 };
    52 
    53 static const struct _SSEData {
    54   union m128_int sse_8x00ff;
    55   union m128_int sse_8x0080;
    56 } c = {
    57     .sse_8x00ff.ull =	{0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
    58     .sse_8x0080.ull =	{0x0080008000800080ULL, 0x0080008000800080ULL},
    59 };
    60 
    61 #define MC(x) (c.sse_##x.m128)
    62 
    63 /* Shuffles the given value such that the alpha for each pixel appears in each
    64  * channel of the pixel.
    65  */
    66 SSE_FUNCTION static inline __m128i
    67 argb_A_sse2(__m128i a)
    68 {
    69   a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
    70   a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
    71   return a;
    72 }
    73 
    74 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
    75  * result by 255, with rounding.
    76  */
    77 SSE_FUNCTION static inline __m128i
    78 muldiv_255_sse2(__m128i a, __m128i b)
    79 {
    80   __m128i ret;
    81   __m128i roundconst = MC(8x0080);
    82 
    83   ret = _mm_mullo_epi16(a, b);
    84   ret = _mm_adds_epu16(ret, roundconst);
    85   ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
    86   ret = _mm_srli_epi16(ret, 8);
    87 
    88   return ret;
    89 }
    90 
    91 SSE_FUNCTION static inline __m128i
    92 negate_argb_sse2(__m128i a)
    93 {
    94   return _mm_xor_si128(a, MC(8x00ff));
    95 }
    96 
    97 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
    98 SSE_FUNCTION static inline __m128i
    99 load_argb_sse2(const uint32_t *src)
   100 {
   101   __m128i pix;
   102 
   103   pix = _mm_loadl_epi64((__m128i *)src);
   104   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
   105   return pix;
   106 }
   107 
   108 SSE_FUNCTION static inline __m128i
   109 set1_argb_sse2(uint32_t src)
   110 {
   111   __m128i pix;
   112 
   113   pix = _mm_set1_epi32(src);
   114   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
   115   return pix;
   116 }
   117 
   118 SSE_FUNCTION static inline __m128i
   119 load_u8_mask(const uint8_t *m)
   120 {
   121   return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
   122 }
   123 
   124 SSE_FUNCTION static inline __m128i
   125 set1_u8_mask(uint8_t m)
   126 {
   127   return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
   128 }
   129 
   130 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
   131 SSE_FUNCTION static void
   132 store_argb_sse2(uint32_t *dest, __m128i pix)
   133 {
   134   pix = _mm_packus_epi16(pix, pix);
   135   _mm_storel_epi64((__m128i *)dest, pix);
   136 }
   137 
   138 SSE_FUNCTION static __m128i 
   139 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
   140 {
   141   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
   142 }
   143 
   144 SSE_FUNCTION static void
   145 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
   146     const uint8_t *mask, int n)
   147 {
   148   for (; n >= 2; n -= 2) {
   149     __m128i s, m;
   150     s = load_argb_sse2(src);
   151     m = load_u8_mask(mask);
   152     store_argb_sse2(dest, muldiv_255_sse2(s, m));
   153     src += 2;
   154     mask += 2;
   155     dest += 2;
   156   }
   157   for (; n > 0; n--) {
   158     uint32_t s = *src++;
   159     uint8_t m = *mask++;
   160 
   161     *dest++ = oil_argb(
   162 	COMPOSITE_IN(oil_argb_A(s), m),
   163 	COMPOSITE_IN(oil_argb_R(s), m),
   164 	COMPOSITE_IN(oil_argb_G(s), m),
   165 	COMPOSITE_IN(oil_argb_B(s), m));
   166   }
   167 }
   168 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
   169     OIL_IMPL_FLAG_SSE2);
   170 
   171 SSE_FUNCTION static void
   172 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
   173     const uint8_t *mask, int n)
   174 {
   175   __m128i s;
   176 
   177   s = set1_argb_sse2(*src);
   178 
   179   for (; n >= 2; n -= 2) {
   180     __m128i m;
   181     m = load_u8_mask(mask);
   182     store_argb_sse2(dest, muldiv_255_sse2(s, m));
   183     mask += 2;
   184     dest += 2;
   185   }
   186   for (; n > 0; n--) {
   187     uint8_t m = *mask++;
   188 
   189     *dest++ = oil_argb(
   190 	COMPOSITE_IN(oil_argb_A(*src), m),
   191 	COMPOSITE_IN(oil_argb_R(*src), m),
   192 	COMPOSITE_IN(oil_argb_G(*src), m),
   193 	COMPOSITE_IN(oil_argb_B(*src), m));
   194   }
   195 }
   196 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
   197     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
   198 
   199 #ifdef SSE_ALIGN
   200 SSE_FUNCTION static void
   201 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
   202     const uint8_t *mask, int n)
   203 {
   204   __m128i m;
   205 
   206   m = set1_u8_mask(*mask);
   207 
   208   for (; n >= 2; n -= 2) {
   209     __m128i s;
   210     s = load_argb_sse2(src);
   211     store_argb_sse2(dest,  muldiv_255_sse2(s, m));
   212     src += 2;
   213     dest += 2;
   214   }
   215   for (; n > 0; n--) {
   216     uint32_t s = *src++;
   217 
   218     *dest++ = oil_argb(
   219 	COMPOSITE_IN(oil_argb_A(s), mask[0]),
   220 	COMPOSITE_IN(oil_argb_R(s), mask[0]),
   221 	COMPOSITE_IN(oil_argb_G(s), mask[0]),
   222 	COMPOSITE_IN(oil_argb_B(s), mask[0]));
   223   }
   224 }
   225 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
   226     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
   227 #endif
   228 
   229 SSE_FUNCTION static void
   230 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
   231 {
   232   for (; n >= 2; n -= 2) {
   233     __m128i d, s;
   234     s = load_argb_sse2(src);
   235     d = load_argb_sse2(dest);
   236     d = over_argb_sse2(d, s, argb_A_sse2(s));
   237     store_argb_sse2(dest, d);
   238     src += 2;
   239     dest += 2;
   240   }
   241   for (; n > 0; n--) {
   242     uint32_t d = *dest, s = *src++;
   243     uint8_t srca = oil_argb_A(s);
   244     d = oil_argb(
   245 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
   246 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
   247 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
   248 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
   249     *dest++ = d;
   250   }
   251 }
   252 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
   253     OIL_IMPL_FLAG_SSE2);
   254 
   255 SSE_FUNCTION static void
   256 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
   257     int n)
   258 {
   259   __m128i s, sa;
   260   uint32_t srca;
   261 
   262   srca = oil_argb_A(*src);
   263   s = set1_argb_sse2(*src);
   264   sa = negate_argb_sse2(argb_A_sse2(s));
   265   for (; n >= 2; n -= 2) {
   266     __m128i d;
   267     d = load_argb_sse2(dest);
   268     d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
   269     store_argb_sse2(dest, d);
   270     dest += 2;
   271   }
   272   for (; n > 0; n--) {
   273     uint32_t d = *dest;
   274     d = oil_argb(
   275 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
   276 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
   277 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
   278 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
   279     *dest++ = d;
   280   }
   281 }
   282 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
   283     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
   284 
   285 SSE_FUNCTION static void
   286 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
   287     const uint8_t *mask, int n)
   288 {
   289   for (; n >= 2; n -= 2) {
   290     __m128i d, s, m;
   291     s = load_argb_sse2(src);
   292     m = load_u8_mask(mask);
   293     d = load_argb_sse2(dest);
   294     s = muldiv_255_sse2(s, m);
   295     d = over_argb_sse2(d, s, argb_A_sse2(s));
   296     store_argb_sse2(dest, d);
   297     src += 2;
   298     mask += 2;
   299     dest += 2;
   300   }
   301   for (; n > 0; n--) {
   302     uint32_t d = *dest, s = *src++, m = *mask++, color;
   303     uint8_t srca;
   304 
   305     color = oil_argb(
   306         COMPOSITE_IN(oil_argb_A(s), m),
   307         COMPOSITE_IN(oil_argb_R(s), m),
   308         COMPOSITE_IN(oil_argb_G(s), m),
   309         COMPOSITE_IN(oil_argb_B(s), m));
   310     srca = oil_argb_A(color);
   311     d = oil_argb(
   312 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
   313 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
   314 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
   315 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
   316     *dest++ = d;
   317   }
   318 }
   319 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
   320     OIL_IMPL_FLAG_SSE2);
   321 
   322 SSE_FUNCTION static void
   323 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
   324     const uint8_t *mask, int n)
   325 {
   326   __m128i s;
   327 
   328   s = set1_argb_sse2(*src);
   329 
   330   for (; n >= 2; n -= 2) {
   331     __m128i d, color, m;
   332     m = load_u8_mask(mask);
   333     d = load_argb_sse2(dest);
   334     color = muldiv_255_sse2(s, m);
   335     d = over_argb_sse2(d, color, argb_A_sse2(color));
   336     store_argb_sse2(dest, d);
   337     mask += 2;
   338     dest += 2;
   339   }
   340   for (; n > 0; n--) {
   341     uint32_t d = *dest, m = *mask++, color;
   342     uint8_t srca;
   343 
   344     color = oil_argb(
   345         COMPOSITE_IN(oil_argb_A(*src), m),
   346         COMPOSITE_IN(oil_argb_R(*src), m),
   347         COMPOSITE_IN(oil_argb_G(*src), m),
   348         COMPOSITE_IN(oil_argb_B(*src), m));
   349     srca = oil_argb_A(color);
   350     d = oil_argb(
   351 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
   352 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
   353 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
   354 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
   355     *dest++ = d;
   356   }
   357 }
   358 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
   359     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
   360 
   361 SSE_FUNCTION static void
   362 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
   363     const uint8_t *mask, int n)
   364 {
   365   __m128i m;
   366 
   367   m = set1_u8_mask(*mask);
   368 
   369   for (; n >= 2; n -= 2) {
   370     __m128i d, s;
   371     s = load_argb_sse2(src);
   372     d = load_argb_sse2(dest);
   373     s = muldiv_255_sse2(s, m);
   374     d = over_argb_sse2(d, s, argb_A_sse2(s));
   375     store_argb_sse2(dest, d);
   376     src += 2;
   377     dest += 2;
   378   }
   379   for (; n > 0; n--) {
   380     uint32_t d = *dest, s = *src++, color;
   381     uint8_t srca;
   382 
   383     color = oil_argb(
   384         COMPOSITE_IN(oil_argb_A(s), *mask),
   385         COMPOSITE_IN(oil_argb_R(s), *mask),
   386         COMPOSITE_IN(oil_argb_G(s), *mask),
   387         COMPOSITE_IN(oil_argb_B(s), *mask));
   388     srca = oil_argb_A(color);
   389     d = oil_argb(
   390 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
   391 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
   392 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
   393 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
   394     *dest++ = d;
   395   }
   396 }
   397 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
   398     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
   399 
   400 SSE_FUNCTION static void
   401 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
   402 {
   403   /* Initial operations to align the destination pointer */
   404   for (; ((long)dest & 15) && (n > 0); n--) {
   405     *dest = COMPOSITE_OVER(*dest, *src, *src);
   406     src++;
   407     dest++;
   408   }
   409   /* over_u8 can be dealt with using our argb code, with srca = s */
   410   for (; n >= 8; n -= 8) {
   411     __m128i d, s;
   412     d = load_argb_sse2((uint32_t *)dest);
   413     s = load_argb_sse2((uint32_t *)src);
   414     store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
   415     src += 8;
   416     dest += 8;
   417   }
   418   for (; n > 0; n--) {
   419     *dest = COMPOSITE_OVER(*dest, *src, *src);
   420     src++;
   421     dest++;
   422   }
   423 }
   424 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
   425     OIL_IMPL_FLAG_SSE2);
   426 #endif
   427