--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/os/ossrv/genericopenlibs/liboil/src/math_sse_unroll2.c	Fri Jun 15 03:10:57 2012 +0200
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2005
+ * Eric Anholt. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+// Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <liboil/liboilclasses.h>
+#include <liboil/liboilfunction.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
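+/* force_align_arg_pointer makes GCC realign the stack on function entry, so
+ * the 16-byte alignment SSE needs holds even when the caller guarantees only
+ * 4-byte stack alignment (the traditional ia32 ABI). */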
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
+add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ + *src2++;
+  }
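+  /* Main loop: two 4-float operations per iteration (unroll factor 2).
+   * The sources are read with unaligned loads; dest is now 16-byte aligned,
+   * so the stores can use the faster aligned form.  The same
+   * prologue/unrolled-loop/epilogue pattern is used throughout this file. */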
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ + *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
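+/* OIL_DEFINE_IMPL_FULL registers the implementation above under the add_f32
+ * function class; liboil's runtime selection will only consider it on CPUs
+ * that report SSE support. */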
+
+SSE_FUNCTION static void
+subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ - *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ - *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ / *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ / *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1 < *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_min_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_min_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1 < *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1 > *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_max_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_max_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1 > *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = 1.0 / *src1++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    /* While _mm_rcp_ps sounds promising, it computes only a ~12-bit
+     * approximation, so its results differ noticeably from the
+     * 1.0 / src1 reference implementation; use a full divide instead.
+     */
+    xmm0 = _mm_set_ps1(1.0);
+    xmm1 = _mm_loadu_ps(src1);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_set_ps1(1.0);
+    xmm1 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = 1.0 / *src1++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+negative_f32_sse_unroll2 (float *dest, float *src1, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = -(*src1++);
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
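+    /* Negate as 0.0 - x.  For x = +0.0 this gives +0.0 while the scalar
+     * -(x) path gives -0.0; only the sign of zero can differ. */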
+    xmm0 = _mm_setzero_ps();
+    xmm1 = _mm_loadu_ps(src1);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_setzero_ps();
+    xmm1 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = -(*src1++);
+  }
+}
+OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
+{
+  __m128 xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ + *val;
+  }
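+  /* Broadcast the scalar into all four lanes once, outside the loop. */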
+  xmm1 = _mm_load_ps1(val);
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ + *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
+{
+  __m128 xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *val;
+  }
+  xmm1 = _mm_load_ps1(val);
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
+{
+  __m128d xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *val;
+  }
+  xmm1 = _mm_load_pd1(val);
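+  /* SSE2 doubles: two per register, so four per unrolled iteration. */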
+  for (; n >= 4; n -= 4) {
+    __m128d xmm0;
+    xmm0 = _mm_loadu_pd(src1);
+    xmm0 = _mm_mul_pd(xmm0, xmm1);
+    _mm_store_pd(dest, xmm0);
+    xmm0 = _mm_loadu_pd(src1 + 2);
+    xmm0 = _mm_mul_pd(xmm0, xmm1);
+    _mm_store_pd(dest + 2, xmm0);
+    dest += 4;
+    src1 += 4;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
+
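+/* On Symbian builds, each impl struct defined by OIL_DEFINE_IMPL_FULL above
+ * is also exposed through a small accessor function that returns its address
+ * (Symbian DLLs traditionally export functions rather than data). */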
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2() {
+  return &_oil_function_impl_add_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2() {
+  return &_oil_function_impl_subtract_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2() {
+  return &_oil_function_impl_multiply_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2() {
+  return &_oil_function_impl_divide_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2() {
+  return &_oil_function_impl_minimum_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2() {
+  return &_oil_function_impl_maximum_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2() {
+  return &_oil_function_impl_inverse_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2() {
+  return &_oil_function_impl_negative_f32_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2() {
+  return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2() {
+  return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2;
+}
+#endif
+
+#ifdef __SYMBIAN32__
+
+OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2() {
+  return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2;
+}
+#endif
+