1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/math_sse.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,460 @@
1.4 +/*
1.5 + * Copyright (c) 2005
1.6 + * Eric Anholt. All rights reserved.
1.7 + *
1.8 + * Redistribution and use in source and binary forms, with or without
1.9 + * modification, are permitted provided that the following conditions
1.10 + * are met:
1.11 + * 1. Redistributions of source code must retain the above copyright
1.12 + * notice, this list of conditions and the following disclaimer.
1.13 + * 2. Redistributions in binary form must reproduce the above copyright
1.14 + * notice, this list of conditions and the following disclaimer in the
1.15 + * documentation and/or other materials provided with the distribution.
1.16 + *
1.17 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
1.18 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1.19 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.20 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
1.21 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1.22 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1.23 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.24 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1.25 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1.26 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1.27 + * SUCH DAMAGE.
1.28 + */
1.29 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.30 +
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>

#include <liboilclasses.h>
#include <liboilfunction.h>

#include <emmintrin.h>
#include <xmmintrin.h>
1.38 +
1.39 +#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
1.40 +
1.41 +SSE_FUNCTION static void
1.42 +add_f32_sse (float *dest, float *src1, float *src2, int n)
1.43 +{
1.44 + /* Initial operations to align the destination pointer */
1.45 + for (; ((long)dest & 15) && (n > 0); n--) {
1.46 + *dest++ = *src1++ + *src2++;
1.47 + }
1.48 + for (; n >= 4; n -= 4) {
1.49 + __m128 xmm0, xmm1;
1.50 + xmm0 = _mm_loadu_ps(src1);
1.51 + xmm1 = _mm_loadu_ps(src2);
1.52 + xmm0 = _mm_add_ps(xmm0, xmm1);
1.53 + _mm_store_ps(dest, xmm0);
1.54 + dest += 4;
1.55 + src1 += 4;
1.56 + src2 += 4;
1.57 + }
1.58 + for (; n > 0; n--) {
1.59 + *dest++ = *src1++ + *src2++;
1.60 + }
1.61 +}
1.62 +OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
1.63 +
1.64 +SSE_FUNCTION static void
1.65 +add_f64_sse2 (double *dest, double *src1, double *src2, int n)
1.66 +{
1.67 + __m128d xmm0, xmm1;
1.68 + while (((long)dest & 15) && (0 < n)) {
1.69 + *dest++ = *src1++ + *src2++;
1.70 + n--;
1.71 + }
1.72 + while (1 < n) {
1.73 + xmm0 = _mm_loadu_pd(src1);
1.74 + xmm1 = _mm_loadu_pd(src2);
1.75 + xmm0 = _mm_add_pd(xmm0, xmm1);
1.76 + _mm_store_pd(dest, xmm0);
1.77 + dest += 2;
1.78 + src1 += 2;
1.79 + src2 += 2;
1.80 + n -= 2;
1.81 + }
1.82 + while (0 < n) {
1.83 + *dest++ = *src1++ + *src2++;
1.84 + n--;
1.85 + }
1.86 +}
1.87 +OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
1.88 +
1.89 +SSE_FUNCTION static void
1.90 +add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
1.91 +{
1.92 + __m128d xmm0, xmm1;
1.93 + while (((long)dest & 15) && (0 < n)) {
1.94 + *dest++ = *src1++ + *src2++;
1.95 + n--;
1.96 + }
1.97 + while (3 < n) {
1.98 + xmm0 = _mm_loadu_pd(src1);
1.99 + xmm1 = _mm_loadu_pd(src2);
1.100 + xmm0 = _mm_add_pd(xmm0, xmm1);
1.101 + _mm_store_pd(dest, xmm0);
1.102 +
1.103 + xmm0 = _mm_loadu_pd(src1+2);
1.104 + xmm1 = _mm_loadu_pd(src2+2);
1.105 + xmm0 = _mm_add_pd(xmm0, xmm1);
1.106 + _mm_store_pd(dest+2, xmm0);
1.107 + dest += 4;
1.108 + src1 += 4;
1.109 + src2 += 4;
1.110 + n -= 4;
1.111 + }
1.112 + while (1 < n) {
1.113 + xmm0 = _mm_loadu_pd(src1);
1.114 + xmm1 = _mm_loadu_pd(src2);
1.115 + xmm0 = _mm_add_pd(xmm0, xmm1);
1.116 + _mm_store_pd(dest, xmm0);
1.117 + dest += 2;
1.118 + src1 += 2;
1.119 + src2 += 2;
1.120 + n -= 2;
1.121 + }
1.122 + while (0 < n) {
1.123 + *dest++ = *src1++ + *src2++;
1.124 + n--;
1.125 + }
1.126 +}
1.127 +OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
1.128 +
1.129 +SSE_FUNCTION static void
1.130 +subtract_f32_sse (float *dest, float *src1, float *src2, int n)
1.131 +{
1.132 + /* Initial operations to align the destination pointer */
1.133 + for (; ((long)dest & 15) && (n > 0); n--) {
1.134 + *dest++ = *src1++ - *src2++;
1.135 + }
1.136 + for (; n >= 4; n -= 4) {
1.137 + __m128 xmm0, xmm1;
1.138 + xmm0 = _mm_loadu_ps(src1);
1.139 + xmm1 = _mm_loadu_ps(src2);
1.140 + xmm0 = _mm_sub_ps(xmm0, xmm1);
1.141 + _mm_store_ps(dest, xmm0);
1.142 + dest += 4;
1.143 + src1 += 4;
1.144 + src2 += 4;
1.145 + }
1.146 + for (; n > 0; n--) {
1.147 + *dest++ = *src1++ - *src2++;
1.148 + }
1.149 +}
1.150 +OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
1.151 +
1.152 +SSE_FUNCTION static void
1.153 +multiply_f32_sse (float *dest, float *src1, float *src2, int n)
1.154 +{
1.155 + /* Initial operations to align the destination pointer */
1.156 + for (; ((long)dest & 15) && (n > 0); n--) {
1.157 + *dest++ = *src1++ * *src2++;
1.158 + }
1.159 + for (; n >= 4; n -= 4) {
1.160 + __m128 xmm0, xmm1;
1.161 + xmm0 = _mm_loadu_ps(src1);
1.162 + xmm1 = _mm_loadu_ps(src2);
1.163 + xmm0 = _mm_mul_ps(xmm0, xmm1);
1.164 + _mm_store_ps(dest, xmm0);
1.165 + dest += 4;
1.166 + src1 += 4;
1.167 + src2 += 4;
1.168 + }
1.169 + for (; n > 0; n--) {
1.170 + *dest++ = *src1++ * *src2++;
1.171 + }
1.172 +}
1.173 +OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
1.174 +
1.175 +SSE_FUNCTION static void
1.176 +divide_f32_sse (float *dest, float *src1, float *src2, int n)
1.177 +{
1.178 + /* Initial operations to align the destination pointer */
1.179 + for (; ((long)dest & 15) && (n > 0); n--) {
1.180 + *dest++ = *src1++ / *src2++;
1.181 + }
1.182 + for (; n >= 4; n -= 4) {
1.183 + __m128 xmm0, xmm1;
1.184 + xmm0 = _mm_loadu_ps(src1);
1.185 + xmm1 = _mm_loadu_ps(src2);
1.186 + xmm0 = _mm_div_ps(xmm0, xmm1);
1.187 + _mm_store_ps(dest, xmm0);
1.188 + dest += 4;
1.189 + src1 += 4;
1.190 + src2 += 4;
1.191 + }
1.192 + for (; n > 0; n--) {
1.193 + *dest++ = *src1++ / *src2++;
1.194 + }
1.195 +}
1.196 +OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
1.197 +
1.198 +SSE_FUNCTION static void
1.199 +minimum_f32_sse (float *dest, float *src1, float *src2, int n)
1.200 +{
1.201 + /* Initial operations to align the destination pointer */
1.202 + for (; ((long)dest & 15) && (n > 0); n--) {
1.203 + *dest++ = *src1 < *src2 ? *src1 : *src2;
1.204 + src1++;
1.205 + src2++;
1.206 + }
1.207 + for (; n >= 4; n -= 4) {
1.208 + __m128 xmm0, xmm1;
1.209 + xmm0 = _mm_loadu_ps(src1);
1.210 + xmm1 = _mm_loadu_ps(src2);
1.211 + xmm0 = _mm_min_ps(xmm0, xmm1);
1.212 + _mm_store_ps(dest, xmm0);
1.213 + dest += 4;
1.214 + src1 += 4;
1.215 + src2 += 4;
1.216 + }
1.217 + for (; n > 0; n--) {
1.218 + *dest++ = *src1 < *src2 ? *src1 : *src2;
1.219 + src1++;
1.220 + src2++;
1.221 + }
1.222 +}
1.223 +OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
1.224 +
1.225 +SSE_FUNCTION static void
1.226 +maximum_f32_sse (float *dest, float *src1, float *src2, int n)
1.227 +{
1.228 + /* Initial operations to align the destination pointer */
1.229 + for (; ((long)dest & 15) && (n > 0); n--) {
1.230 + *dest++ = *src1 > *src2 ? *src1 : *src2;
1.231 + src1++;
1.232 + src2++;
1.233 + }
1.234 + for (; n >= 4; n -= 4) {
1.235 + __m128 xmm0, xmm1;
1.236 + xmm0 = _mm_loadu_ps(src1);
1.237 + xmm1 = _mm_loadu_ps(src2);
1.238 + xmm0 = _mm_max_ps(xmm0, xmm1);
1.239 + _mm_store_ps(dest, xmm0);
1.240 + dest += 4;
1.241 + src1 += 4;
1.242 + src2 += 4;
1.243 + }
1.244 + for (; n > 0; n--) {
1.245 + *dest++ = *src1 > *src2 ? *src1 : *src2;
1.246 + src1++;
1.247 + src2++;
1.248 + }
1.249 +}
1.250 +OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
1.251 +
1.252 +SSE_FUNCTION static void
1.253 +inverse_f32_sse (float *dest, float *src1, int n)
1.254 +{
1.255 + /* Initial operations to align the destination pointer */
1.256 + for (; ((long)dest & 15) && (n > 0); n--) {
1.257 + *dest++ = 1.0 / *src1++;
1.258 + }
1.259 + for (; n >= 4; n -= 4) {
1.260 + __m128 xmm0, xmm1;
1.261 + /* While _mm_rcp_ps sounds promising, the results it gives are rather
1.262 + * different from the 1.0 / src1 reference implementation, so do that.
1.263 + */
1.264 + xmm0 = _mm_set_ps1(1.0);
1.265 + xmm1 = _mm_loadu_ps(src1);
1.266 + xmm0 = _mm_div_ps(xmm0, xmm1);
1.267 + _mm_store_ps(dest, xmm0);
1.268 + dest += 4;
1.269 + src1 += 4;
1.270 + }
1.271 + for (; n > 0; n--) {
1.272 + *dest++ = 1.0 / *src1++;
1.273 + }
1.274 +}
1.275 +OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
1.276 +
1.277 +SSE_FUNCTION static void
1.278 +negative_f32_sse (float *dest, float *src1, int n)
1.279 +{
1.280 + /* Initial operations to align the destination pointer */
1.281 + for (; ((long)dest & 15) && (n > 0); n--) {
1.282 + *dest++ = -(*src1++);
1.283 + }
1.284 + for (; n >= 4; n -= 4) {
1.285 + __m128 xmm0, xmm1;
1.286 + xmm0 = _mm_setzero_ps();
1.287 + xmm1 = _mm_loadu_ps(src1);
1.288 + xmm0 = _mm_sub_ps(xmm0, xmm1);
1.289 + _mm_store_ps(dest, xmm0);
1.290 + dest += 4;
1.291 + src1 += 4;
1.292 + }
1.293 + for (; n > 0; n--) {
1.294 + *dest++ = -(*src1++);
1.295 + }
1.296 +}
1.297 +OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
1.298 +
1.299 +SSE_FUNCTION static void
1.300 +scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
1.301 +{
1.302 + __m128 xmm1;
1.303 +
1.304 + /* Initial operations to align the destination pointer */
1.305 + for (; ((long)dest & 15) && (n > 0); n--) {
1.306 + *dest++ = *src1++ + *val;
1.307 + }
1.308 + xmm1 = _mm_load_ps1(val);
1.309 + for (; n >= 4; n -= 4) {
1.310 + __m128 xmm0;
1.311 + xmm0 = _mm_loadu_ps(src1);
1.312 + xmm0 = _mm_add_ps(xmm0, xmm1);
1.313 + _mm_store_ps(dest, xmm0);
1.314 + dest += 4;
1.315 + src1 += 4;
1.316 + }
1.317 + for (; n > 0; n--) {
1.318 + *dest++ = *src1++ + *val;
1.319 + }
1.320 +}
1.321 +OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
1.322 +
1.323 +SSE_FUNCTION static void
1.324 +scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
1.325 +{
1.326 + __m128 xmm1;
1.327 +
1.328 + /* Initial operations to align the destination pointer */
1.329 + for (; ((long)dest & 15) && (n > 0); n--) {
1.330 + *dest++ = *src1++ * *val;
1.331 + }
1.332 + xmm1 = _mm_load_ps1(val);
1.333 + for (; n >= 4; n -= 4) {
1.334 + __m128 xmm0;
1.335 + xmm0 = _mm_loadu_ps(src1);
1.336 + xmm0 = _mm_mul_ps(xmm0, xmm1);
1.337 + _mm_store_ps(dest, xmm0);
1.338 + dest += 4;
1.339 + src1 += 4;
1.340 + }
1.341 + for (; n > 0; n--) {
1.342 + *dest++ = *src1++ * *val;
1.343 + }
1.344 +}
1.345 +OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
1.346 +
1.347 +SSE_FUNCTION static void
1.348 +scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
1.349 +{
1.350 + __m128d xmm1;
1.351 +
1.352 + /* Initial operations to align the destination pointer */
1.353 + for (; ((long)dest & 15) && (n > 0); n--) {
1.354 + *dest++ = *src1++ * *val;
1.355 + }
1.356 + xmm1 = _mm_load_pd1(val);
1.357 + for (; n >= 2; n -= 2) {
1.358 + __m128d xmm0;
1.359 + xmm0 = _mm_loadu_pd(src1);
1.360 + xmm0 = _mm_mul_pd(xmm0, xmm1);
1.361 + _mm_store_pd(dest, xmm0);
1.362 + dest += 2;
1.363 + src1 += 2;
1.364 + }
1.365 + for (; n > 0; n--) {
1.366 + *dest++ = *src1++ * *val;
1.367 + }
1.368 +}
1.369 +OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
1.370 +
1.371 +
1.372 +
/* Symbian export stubs.
 *
 * NOTE(review): these stubs look machine-generated and are syntactically
 * dubious C: each header line declares BOTH a pointer variable
 * (__oil_function_impl_<name>) and a function named after the function
 * class (e.g. add_f32()), and each return statement uses the comma
 * operator, so only the right-hand operand is actually returned.  This
 * exact pattern appears throughout the Nokia Symbian port of liboil, so
 * it is presumably consumed by the Symbian DEF-file/export machinery and
 * is deliberately left byte-identical here -- confirm against the
 * Symbian build before "fixing" it. */
#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f32_sse, add_f32() {
	return &_oil_function_impl_add_f32_sse, add_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f64_sse2, add_f64() {
	return &_oil_function_impl_add_f64_sse2, add_f64;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll, add_f64() {
	return &_oil_function_impl_add_f64_sse2_unroll, add_f64;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_subtract_f32_sse, subtract_f32() {
	return &_oil_function_impl_subtract_f32_sse, subtract_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_f32_sse, multiply_f32() {
	return &_oil_function_impl_multiply_f32_sse, multiply_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_divide_f32_sse, divide_f32() {
	return &_oil_function_impl_divide_f32_sse, divide_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_minimum_f32_sse, minimum_f32() {
	return &_oil_function_impl_minimum_f32_sse, minimum_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_maximum_f32_sse, maximum_f32() {
	return &_oil_function_impl_maximum_f32_sse, maximum_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_inverse_f32_sse, inverse_f32() {
	return &_oil_function_impl_inverse_f32_sse, inverse_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_negative_f32_sse, negative_f32() {
	return &_oil_function_impl_negative_f32_sse, negative_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns() {
	return &_oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns() {
	return &_oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns() {
	return &_oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns;
}
#endif
1.463 +