os/ossrv/genericopenlibs/liboil/src/math_sse.c
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/math_sse.c	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,460 @@
     1.4 +/*
     1.5 + * Copyright (c) 2005
     1.6 + *	Eric Anholt.  All rights reserved.
     1.7 + *
     1.8 + * Redistribution and use in source and binary forms, with or without
     1.9 + * modification, are permitted provided that the following conditions
    1.10 + * are met:
    1.11 + * 1. Redistributions of source code must retain the above copyright
    1.12 + *    notice, this list of conditions and the following disclaimer.
    1.13 + * 2. Redistributions in binary form must reproduce the above copyright
    1.14 + *    notice, this list of conditions and the following disclaimer in the
    1.15 + *    documentation and/or other materials provided with the distribution.
    1.16 + *
    1.17 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
    1.18 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.19 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    1.20 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
    1.21 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    1.22 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    1.23 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    1.24 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    1.25 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    1.26 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    1.27 + * SUCH DAMAGE.
    1.28 + */
    1.29 +//Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
    1.30 +
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <stdint.h>
#include <emmintrin.h>
#include <xmmintrin.h>
    1.38 +
    1.39 +#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
    1.40 +
    1.41 +SSE_FUNCTION static void
    1.42 +add_f32_sse (float *dest, float *src1, float *src2, int n)
    1.43 +{
    1.44 +  /* Initial operations to align the destination pointer */
    1.45 +  for (; ((long)dest & 15) && (n > 0); n--) {
    1.46 +    *dest++ = *src1++ + *src2++;
    1.47 +  }
    1.48 +  for (; n >= 4; n -= 4) {
    1.49 +    __m128 xmm0, xmm1;
    1.50 +    xmm0 = _mm_loadu_ps(src1);
    1.51 +    xmm1 = _mm_loadu_ps(src2);
    1.52 +    xmm0 = _mm_add_ps(xmm0, xmm1);
    1.53 +    _mm_store_ps(dest, xmm0);
    1.54 +    dest += 4;
    1.55 +    src1 += 4;
    1.56 +    src2 += 4;
    1.57 +  }
    1.58 +  for (; n > 0; n--) {
    1.59 +    *dest++ = *src1++ + *src2++;
    1.60 +  }
    1.61 +}
    1.62 +OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
    1.63 +
    1.64 +SSE_FUNCTION static void
    1.65 +add_f64_sse2 (double *dest, double *src1, double *src2, int n)
    1.66 +{
    1.67 +  __m128d xmm0, xmm1;
    1.68 +  while (((long)dest & 15) && (0 < n)) {
    1.69 +    *dest++ = *src1++ + *src2++;
    1.70 +    n--;
    1.71 +  }
    1.72 +  while (1 < n) {
    1.73 +    xmm0 = _mm_loadu_pd(src1);
    1.74 +    xmm1 = _mm_loadu_pd(src2);
    1.75 +    xmm0 = _mm_add_pd(xmm0, xmm1);
    1.76 +    _mm_store_pd(dest, xmm0);
    1.77 +    dest += 2;
    1.78 +    src1 += 2;
    1.79 +    src2 += 2;
    1.80 +    n -= 2;
    1.81 +  }
    1.82 +  while (0 < n) {
    1.83 +    *dest++ = *src1++ + *src2++;
    1.84 +    n--;
    1.85 +  }
    1.86 +}
    1.87 +OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
    1.88 +
    1.89 +SSE_FUNCTION static void
    1.90 +add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
    1.91 +{
    1.92 +  __m128d xmm0, xmm1;
    1.93 +  while (((long)dest & 15) && (0 < n)) {
    1.94 +    *dest++ = *src1++ + *src2++;
    1.95 +    n--;
    1.96 +  }
    1.97 +  while (3 < n) {
    1.98 +    xmm0 = _mm_loadu_pd(src1);
    1.99 +    xmm1 = _mm_loadu_pd(src2);
   1.100 +    xmm0 = _mm_add_pd(xmm0, xmm1);
   1.101 +    _mm_store_pd(dest, xmm0);
   1.102 +
   1.103 +    xmm0 = _mm_loadu_pd(src1+2);
   1.104 +    xmm1 = _mm_loadu_pd(src2+2);
   1.105 +    xmm0 = _mm_add_pd(xmm0, xmm1);
   1.106 +    _mm_store_pd(dest+2, xmm0);
   1.107 +    dest += 4;
   1.108 +    src1 += 4;
   1.109 +    src2 += 4;
   1.110 +    n -= 4;
   1.111 +  }
   1.112 +  while (1 < n) {
   1.113 +    xmm0 = _mm_loadu_pd(src1);
   1.114 +    xmm1 = _mm_loadu_pd(src2);
   1.115 +    xmm0 = _mm_add_pd(xmm0, xmm1);
   1.116 +    _mm_store_pd(dest, xmm0);
   1.117 +    dest += 2;
   1.118 +    src1 += 2;
   1.119 +    src2 += 2;
   1.120 +    n -= 2;
   1.121 +  }
   1.122 +  while (0 < n) {
   1.123 +    *dest++ = *src1++ + *src2++;
   1.124 +    n--;
   1.125 +  }
   1.126 +}
   1.127 +OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
   1.128 +
   1.129 +SSE_FUNCTION static void
   1.130 +subtract_f32_sse (float *dest, float *src1, float *src2, int n)
   1.131 +{
   1.132 +  /* Initial operations to align the destination pointer */
   1.133 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.134 +    *dest++ = *src1++ - *src2++;
   1.135 +  }
   1.136 +  for (; n >= 4; n -= 4) {
   1.137 +    __m128 xmm0, xmm1;
   1.138 +    xmm0 = _mm_loadu_ps(src1);
   1.139 +    xmm1 = _mm_loadu_ps(src2);
   1.140 +    xmm0 = _mm_sub_ps(xmm0, xmm1);
   1.141 +    _mm_store_ps(dest, xmm0);
   1.142 +    dest += 4;
   1.143 +    src1 += 4;
   1.144 +    src2 += 4;
   1.145 +  }
   1.146 +  for (; n > 0; n--) {
   1.147 +    *dest++ = *src1++ - *src2++;
   1.148 +  }
   1.149 +}
   1.150 +OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
   1.151 +
   1.152 +SSE_FUNCTION static void
   1.153 +multiply_f32_sse (float *dest, float *src1, float *src2, int n)
   1.154 +{
   1.155 +  /* Initial operations to align the destination pointer */
   1.156 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.157 +    *dest++ = *src1++ * *src2++;
   1.158 +  }
   1.159 +  for (; n >= 4; n -= 4) {
   1.160 +    __m128 xmm0, xmm1;
   1.161 +    xmm0 = _mm_loadu_ps(src1);
   1.162 +    xmm1 = _mm_loadu_ps(src2);
   1.163 +    xmm0 = _mm_mul_ps(xmm0, xmm1);
   1.164 +    _mm_store_ps(dest, xmm0);
   1.165 +    dest += 4;
   1.166 +    src1 += 4;
   1.167 +    src2 += 4;
   1.168 +  }
   1.169 +  for (; n > 0; n--) {
   1.170 +    *dest++ = *src1++ * *src2++;
   1.171 +  }
   1.172 +}
   1.173 +OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
   1.174 +
   1.175 +SSE_FUNCTION static void
   1.176 +divide_f32_sse (float *dest, float *src1, float *src2, int n)
   1.177 +{
   1.178 +  /* Initial operations to align the destination pointer */
   1.179 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.180 +    *dest++ = *src1++ / *src2++;
   1.181 +  }
   1.182 +  for (; n >= 4; n -= 4) {
   1.183 +    __m128 xmm0, xmm1;
   1.184 +    xmm0 = _mm_loadu_ps(src1);
   1.185 +    xmm1 = _mm_loadu_ps(src2);
   1.186 +    xmm0 = _mm_div_ps(xmm0, xmm1);
   1.187 +    _mm_store_ps(dest, xmm0);
   1.188 +    dest += 4;
   1.189 +    src1 += 4;
   1.190 +    src2 += 4;
   1.191 +  }
   1.192 +  for (; n > 0; n--) {
   1.193 +    *dest++ = *src1++ / *src2++;
   1.194 +  }
   1.195 +}
   1.196 +OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
   1.197 +
   1.198 +SSE_FUNCTION static void
   1.199 +minimum_f32_sse (float *dest, float *src1, float *src2, int n)
   1.200 +{
   1.201 +  /* Initial operations to align the destination pointer */
   1.202 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.203 +    *dest++ = *src1 < *src2 ? *src1 : *src2;
   1.204 +    src1++;
   1.205 +    src2++;
   1.206 +  }
   1.207 +  for (; n >= 4; n -= 4) {
   1.208 +    __m128 xmm0, xmm1;
   1.209 +    xmm0 = _mm_loadu_ps(src1);
   1.210 +    xmm1 = _mm_loadu_ps(src2);
   1.211 +    xmm0 = _mm_min_ps(xmm0, xmm1);
   1.212 +    _mm_store_ps(dest, xmm0);
   1.213 +    dest += 4;
   1.214 +    src1 += 4;
   1.215 +    src2 += 4;
   1.216 +  }
   1.217 +  for (; n > 0; n--) {
   1.218 +    *dest++ = *src1 < *src2 ? *src1 : *src2;
   1.219 +    src1++;
   1.220 +    src2++;
   1.221 +  }
   1.222 +}
   1.223 +OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
   1.224 +
   1.225 +SSE_FUNCTION static void
   1.226 +maximum_f32_sse (float *dest, float *src1, float *src2, int n)
   1.227 +{
   1.228 +  /* Initial operations to align the destination pointer */
   1.229 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.230 +    *dest++ = *src1 > *src2 ? *src1 : *src2;
   1.231 +    src1++;
   1.232 +    src2++;
   1.233 +  }
   1.234 +  for (; n >= 4; n -= 4) {
   1.235 +    __m128 xmm0, xmm1;
   1.236 +    xmm0 = _mm_loadu_ps(src1);
   1.237 +    xmm1 = _mm_loadu_ps(src2);
   1.238 +    xmm0 = _mm_max_ps(xmm0, xmm1);
   1.239 +    _mm_store_ps(dest, xmm0);
   1.240 +    dest += 4;
   1.241 +    src1 += 4;
   1.242 +    src2 += 4;
   1.243 +  }
   1.244 +  for (; n > 0; n--) {
   1.245 +    *dest++ = *src1 > *src2 ? *src1 : *src2;
   1.246 +    src1++;
   1.247 +    src2++;
   1.248 +  }
   1.249 +}
   1.250 +OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
   1.251 +
   1.252 +SSE_FUNCTION static void
   1.253 +inverse_f32_sse (float *dest, float *src1, int n)
   1.254 +{
   1.255 +  /* Initial operations to align the destination pointer */
   1.256 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.257 +    *dest++ = 1.0 / *src1++;
   1.258 +  }
   1.259 +  for (; n >= 4; n -= 4) {
   1.260 +    __m128 xmm0, xmm1;
   1.261 +    /* While _mm_rcp_ps sounds promising, the results it gives are rather
   1.262 +     * different from the 1.0 / src1 reference implementation, so do that.
   1.263 +     */
   1.264 +    xmm0 = _mm_set_ps1(1.0);
   1.265 +    xmm1 = _mm_loadu_ps(src1);
   1.266 +    xmm0 = _mm_div_ps(xmm0, xmm1);
   1.267 +    _mm_store_ps(dest, xmm0);
   1.268 +    dest += 4;
   1.269 +    src1 += 4;
   1.270 +  }
   1.271 +  for (; n > 0; n--) {
   1.272 +    *dest++ = 1.0 / *src1++;
   1.273 +  }
   1.274 +}
   1.275 +OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
   1.276 +
   1.277 +SSE_FUNCTION static void
   1.278 +negative_f32_sse (float *dest, float *src1, int n)
   1.279 +{
   1.280 +  /* Initial operations to align the destination pointer */
   1.281 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.282 +    *dest++ = -(*src1++);
   1.283 +  }
   1.284 +  for (; n >= 4; n -= 4) {
   1.285 +    __m128 xmm0, xmm1;
   1.286 +    xmm0 = _mm_setzero_ps();
   1.287 +    xmm1 = _mm_loadu_ps(src1);
   1.288 +    xmm0 = _mm_sub_ps(xmm0, xmm1);
   1.289 +    _mm_store_ps(dest, xmm0);
   1.290 +    dest += 4;
   1.291 +    src1 += 4;
   1.292 +  }
   1.293 +  for (; n > 0; n--) {
   1.294 +    *dest++ = -(*src1++);
   1.295 +  }
   1.296 +}
   1.297 +OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
   1.298 +
   1.299 +SSE_FUNCTION static void
   1.300 +scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
   1.301 +{
   1.302 +  __m128 xmm1;
   1.303 +
   1.304 +  /* Initial operations to align the destination pointer */
   1.305 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.306 +    *dest++ = *src1++ + *val;
   1.307 +  }
   1.308 +  xmm1 = _mm_load_ps1(val);
   1.309 +  for (; n >= 4; n -= 4) {
   1.310 +    __m128 xmm0;
   1.311 +    xmm0 = _mm_loadu_ps(src1);
   1.312 +    xmm0 = _mm_add_ps(xmm0, xmm1);
   1.313 +    _mm_store_ps(dest, xmm0);
   1.314 +    dest += 4;
   1.315 +    src1 += 4;
   1.316 +  }
   1.317 +  for (; n > 0; n--) {
   1.318 +    *dest++ = *src1++ + *val;
   1.319 +  }
   1.320 +}
   1.321 +OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
   1.322 +
   1.323 +SSE_FUNCTION static void
   1.324 +scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
   1.325 +{
   1.326 +  __m128 xmm1;
   1.327 +
   1.328 +  /* Initial operations to align the destination pointer */
   1.329 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.330 +    *dest++ = *src1++ * *val;
   1.331 +  }
   1.332 +  xmm1 = _mm_load_ps1(val);
   1.333 +  for (; n >= 4; n -= 4) {
   1.334 +    __m128 xmm0;
   1.335 +    xmm0 = _mm_loadu_ps(src1);
   1.336 +    xmm0 = _mm_mul_ps(xmm0, xmm1);
   1.337 +    _mm_store_ps(dest, xmm0);
   1.338 +    dest += 4;
   1.339 +    src1 += 4;
   1.340 +  }
   1.341 +  for (; n > 0; n--) {
   1.342 +    *dest++ = *src1++ * *val;
   1.343 +  }
   1.344 +}
   1.345 +OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
   1.346 +
   1.347 +SSE_FUNCTION static void
   1.348 +scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
   1.349 +{
   1.350 +  __m128d xmm1;
   1.351 +
   1.352 +  /* Initial operations to align the destination pointer */
   1.353 +  for (; ((long)dest & 15) && (n > 0); n--) {
   1.354 +    *dest++ = *src1++ * *val;
   1.355 +  }
   1.356 +  xmm1 = _mm_load_pd1(val);
   1.357 +  for (; n >= 2; n -= 2) {
   1.358 +    __m128d xmm0;
   1.359 +    xmm0 = _mm_loadu_pd(src1);
   1.360 +    xmm0 = _mm_mul_pd(xmm0, xmm1);
   1.361 +    _mm_store_pd(dest, xmm0);
   1.362 +    dest += 2;
   1.363 +    src1 += 2;
   1.364 +  }
   1.365 +  for (; n > 0; n--) {
   1.366 +    *dest++ = *src1++ * *val;
   1.367 +  }
   1.368 +}
   1.369 +OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
   1.370 +
   1.371 +
   1.372 +
   1.373 +#ifdef	__SYMBIAN32__
   1.374 + 
   1.375 +OilFunctionImpl* __oil_function_impl_add_f32_sse, add_f32() {
   1.376 +		return &_oil_function_impl_add_f32_sse, add_f32;
   1.377 +}
   1.378 +#endif
   1.379 +
   1.380 +#ifdef	__SYMBIAN32__
   1.381 + 
   1.382 +OilFunctionImpl* __oil_function_impl_add_f64_sse2, add_f64() {
   1.383 +		return &_oil_function_impl_add_f64_sse2, add_f64;
   1.384 +}
   1.385 +#endif
   1.386 +
   1.387 +#ifdef	__SYMBIAN32__
   1.388 + 
   1.389 +OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll, add_f64() {
   1.390 +		return &_oil_function_impl_add_f64_sse2_unroll, add_f64;
   1.391 +}
   1.392 +#endif
   1.393 +
   1.394 +#ifdef	__SYMBIAN32__
   1.395 + 
   1.396 +OilFunctionImpl* __oil_function_impl_subtract_f32_sse, subtract_f32() {
   1.397 +		return &_oil_function_impl_subtract_f32_sse, subtract_f32;
   1.398 +}
   1.399 +#endif
   1.400 +
   1.401 +#ifdef	__SYMBIAN32__
   1.402 + 
   1.403 +OilFunctionImpl* __oil_function_impl_multiply_f32_sse, multiply_f32() {
   1.404 +		return &_oil_function_impl_multiply_f32_sse, multiply_f32;
   1.405 +}
   1.406 +#endif
   1.407 +
   1.408 +#ifdef	__SYMBIAN32__
   1.409 + 
   1.410 +OilFunctionImpl* __oil_function_impl_divide_f32_sse, divide_f32() {
   1.411 +		return &_oil_function_impl_divide_f32_sse, divide_f32;
   1.412 +}
   1.413 +#endif
   1.414 +
   1.415 +#ifdef	__SYMBIAN32__
   1.416 + 
   1.417 +OilFunctionImpl* __oil_function_impl_minimum_f32_sse, minimum_f32() {
   1.418 +		return &_oil_function_impl_minimum_f32_sse, minimum_f32;
   1.419 +}
   1.420 +#endif
   1.421 +
   1.422 +#ifdef	__SYMBIAN32__
   1.423 + 
   1.424 +OilFunctionImpl* __oil_function_impl_maximum_f32_sse, maximum_f32() {
   1.425 +		return &_oil_function_impl_maximum_f32_sse, maximum_f32;
   1.426 +}
   1.427 +#endif
   1.428 +
   1.429 +#ifdef	__SYMBIAN32__
   1.430 + 
   1.431 +OilFunctionImpl* __oil_function_impl_inverse_f32_sse, inverse_f32() {
   1.432 +		return &_oil_function_impl_inverse_f32_sse, inverse_f32;
   1.433 +}
   1.434 +#endif
   1.435 +
   1.436 +#ifdef	__SYMBIAN32__
   1.437 + 
   1.438 +OilFunctionImpl* __oil_function_impl_negative_f32_sse, negative_f32() {
   1.439 +		return &_oil_function_impl_negative_f32_sse, negative_f32;
   1.440 +}
   1.441 +#endif
   1.442 +
   1.443 +#ifdef	__SYMBIAN32__
   1.444 + 
   1.445 +OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns() {
   1.446 +		return &_oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns;
   1.447 +}
   1.448 +#endif
   1.449 +
   1.450 +#ifdef	__SYMBIAN32__
   1.451 + 
   1.452 +OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns() {
   1.453 +		return &_oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns;
   1.454 +}
   1.455 +#endif
   1.456 +
   1.457 +#ifdef	__SYMBIAN32__
   1.458 + 
   1.459 +OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns() {
   1.460 +		return &_oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns;
   1.461 +}
   1.462 +#endif
   1.463 +