os/ossrv/genericopenlibs/liboil/src/math_sse_unroll2.c
/*
 * Copyright (c) 2005
 *	Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/* Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>

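/* force_align_arg_pointer makes GCC realign the stack to a 16-byte boundary
 * on entry, so SSE register spills inside these functions are safe even when
 * the caller only guarantees 4-byte stack alignment (the usual 32-bit x86
 * calling convention). */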
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))

SSE_FUNCTION static void
add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *src2++;
  }
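  /* dest is now 16-byte aligned, so the main loop can use aligned stores
   * (_mm_store_ps); src1 and src2 may still be unaligned, hence _mm_loadu_ps.
   * Each iteration processes two 4-float vectors (8 elements), giving the
   * "unroll2" in the name. */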
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
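/*
 * Illustrative usage sketch (not part of this translation unit): callers go
 * through liboil's public entry points rather than these impls directly.
 * oil_init and oil_add_f32 are liboil's public API; the exact prototypes are
 * declared in the installed liboil headers.
 *
 *   #include <liboil/liboil.h>
 *
 *   float a[256], b[256], sum[256];
 *   oil_init ();                   // runtime CPU detection picks an impl
 *   oil_add_f32 (sum, a, b, 256);  // may dispatch to add_f32_sse_unroll2
 */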

SSE_FUNCTION static void
subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ - *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ - *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
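/* Note: _mm_min_ps and _mm_max_ps match the scalar ternaries above even for
 * NaN inputs: when the comparison is unordered they return the second
 * operand, i.e. the value from src2, just like the reference C code. */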

SSE_FUNCTION static void
inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, it only computes an approximate
     * reciprocal (roughly 12 bits of precision), so its results differ
     * noticeably from the 1.0 / src1 reference implementation.  Use a full
     * division instead. */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
negative_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = -(*src1++);
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = -(*src1++);
  }
}
OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
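  /* Broadcast the scalar addend into all four lanes once, before the main
   * loop, so it does not have to be reloaded on every iteration. */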
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
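  /* SSE2 double-precision variant: _mm_load_pd1 broadcasts the scalar into
   * both 64-bit lanes, and each iteration handles two 2-double vectors
   * (4 elements). */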
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
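/* The flag passed to OIL_DEFINE_IMPL_FULL records which instruction-set
 * extension an impl requires; liboil's runtime CPU detection only selects an
 * impl whose flags the host CPU supports. */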


#ifdef __SYMBIAN32__

/* Symbian export helpers: each returns the address of the corresponding
 * OilFunctionImpl record defined by OIL_DEFINE_IMPL_FULL above. */
OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2() {
    return &_oil_function_impl_add_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2() {
    return &_oil_function_impl_subtract_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2() {
    return &_oil_function_impl_multiply_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2() {
    return &_oil_function_impl_divide_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2() {
    return &_oil_function_impl_minimum_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2() {
    return &_oil_function_impl_maximum_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2() {
    return &_oil_function_impl_inverse_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2() {
    return &_oil_function_impl_negative_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2() {
    return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2() {
    return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2() {
    return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2;
}
#endif