1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/simdpack/squaresum_f64.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,176 @@
1.4 +/*
1.5 + * LIBOIL - Library of Optimized Inner Loops
1.6 + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
1.7 + * All rights reserved.
1.8 + *
1.9 + * Redistribution and use in source and binary forms, with or without
1.10 + * modification, are permitted provided that the following conditions
1.11 + * are met:
1.12 + * 1. Redistributions of source code must retain the above copyright
1.13 + * notice, this list of conditions and the following disclaimer.
1.14 + * 2. Redistributions in binary form must reproduce the above copyright
1.15 + * notice, this list of conditions and the following disclaimer in the
1.16 + * documentation and/or other materials provided with the distribution.
1.17 + *
1.18 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
1.19 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1.20 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.21 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
1.22 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1.23 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1.24 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.25 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
1.26 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
1.27 + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.28 + * POSSIBILITY OF SUCH DAMAGE.
1.29 + */
1.30 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.31 +
1.32 +#ifdef HAVE_CONFIG_H
1.33 +#include "config.h"
1.34 +#endif
1.35 +
1.36 +#include <liboil/liboilfunction.h>
1.37 +#include "liboil/simdpack/simdpack.h"
1.38 +#include <math.h>
1.39 +
/*
 * Plain sum of squares.
 *
 * Stores src[0]*src[0] + ... + src[n-1]*src[n-1] in *dest, accumulating in
 * element order with a single accumulator.  When n is zero or negative no
 * element of src is read and 0 is stored.
 */
static void
squaresum_f64_i10_simple(double *dest, double *src, int n)
{
  double acc = 0;
  const double *cur = src;
  int left;

  for (left = n; left > 0; left--) {
    const double v = *cur++;

    acc += v * v;
  }

  *dest = acc;
}
1.52 +OIL_DEFINE_IMPL (squaresum_f64_i10_simple, squaresum_f64);
1.53 +
1.54 +#if 0
1.55 +#include <multsum_f64.h>
1.56 +static void
1.57 +squaresum_f64_i10_multsum(double *dest, double *src, int n)
1.58 +{
1.59 + multsum_f64(dest,src,src,n);
1.60 +}
1.61 +#endif
1.62 +
/*
 * Sum of squares, unrolled four ways using a moving source pointer.
 *
 * dest: receives the sum of src[i]*src[i] for i in [0, n).
 * src:  input values; exactly n elements are read.
 * n:    element count; when n <= 0 nothing is read and 0 is stored.
 *
 * The leading (n % 4) elements are folded into the first accumulator so that
 * the main loop always consumes whole groups of four.  The four partial sums
 * are added together at the end, so rounding can differ from a strictly
 * sequential summation.
 */
static void
squaresum_f64_i10_unroll4a(double *dest, double *src, int n)
{
  double sum1 = 0;
  double sum2 = 0;
  double sum3 = 0;
  double sum4 = 0;

  /* A negative count has nonzero low bits (e.g. -1 & 0x3 == 3), which would
   * send the remainder loop below over elements the caller never offered. */
  if (n <= 0) {
    *dest = 0;
    return;
  }

  /* Peel off the remainder so that n becomes a multiple of four. */
  while (n & 0x3) {
    sum1 += *src * *src;
    src++;
    n--;
  }

  while (n > 0) {
    sum1 += src[0] * src[0];
    sum2 += src[1] * src[1];
    sum3 += src[2] * src[2];
    sum4 += src[3] * src[3];
    src += 4;
    n -= 4;
  }

  *dest = sum1 + sum2 + sum3 + sum4;
}
1.90 +OIL_DEFINE_IMPL (squaresum_f64_i10_unroll4a, squaresum_f64);
1.91 +
/*
 * Sum of squares, unrolled four ways using indexed access.
 *
 * dest: receives the sum of src[i]*src[i] for i in [0, n).
 * src:  input values; exactly n elements are read.
 * n:    element count; when n <= 0 nothing is read and 0 is stored.
 *
 * The leading (n % 4) elements go into the first accumulator; the rest is
 * processed four elements per iteration into four separate partial sums that
 * are combined at the end.  Rounding can therefore differ from a strictly
 * sequential summation.
 */
static void
squaresum_f64_i10_unroll4(double *dest, double *src, int n)
{
  double sum1 = 0;
  double sum2 = 0;
  double sum3 = 0;
  double sum4 = 0;
  int i;

  /* A negative count has nonzero low bits (e.g. -1 & 0x3 == 3), which would
   * send the remainder loop below over elements the caller never offered. */
  if (n <= 0) {
    *dest = 0;
    return;
  }

  /* Peel off the remainder so that n becomes a multiple of four. */
  while (n & 0x3) {
    sum1 += src[0] * src[0];
    src++;
    n--;
  }

  for (i = 0; i < n; i += 4) {
    sum1 += src[i] * src[i];
    sum2 += src[i + 1] * src[i + 1];
    sum3 += src[i + 2] * src[i + 2];
    sum4 += src[i + 3] * src[i + 3];
  }

  *dest = sum1 + sum2 + sum3 + sum4;
}
1.115 +OIL_DEFINE_IMPL (squaresum_f64_i10_unroll4, squaresum_f64);
1.116 +
/*
 * Sum of squares, unrolled eight ways using indexed access.
 *
 * dest: receives the sum of src[i]*src[i] for i in [0, n).
 * src:  input values; exactly n elements are read.
 * n:    element count; when n <= 0 nothing is read and 0 is stored.
 *
 * The leading (n % 8) elements go into the first accumulator; the rest is
 * processed eight elements per iteration into eight separate partial sums
 * that are combined at the end.  Rounding can therefore differ from a
 * strictly sequential summation.
 */
static void
squaresum_f64_i10_unroll8(double *dest, double *src, int n)
{
  double sum1 = 0;
  double sum2 = 0;
  double sum3 = 0;
  double sum4 = 0;
  double sum5 = 0;
  double sum6 = 0;
  double sum7 = 0;
  double sum8 = 0;
  int i;

  /* A negative count has nonzero low bits (e.g. -1 & 0x7 == 7), which would
   * send the remainder loop below over elements the caller never offered. */
  if (n <= 0) {
    *dest = 0;
    return;
  }

  /* Peel off the remainder so that n becomes a multiple of eight. */
  while (n & 0x7) {
    sum1 += src[0] * src[0];
    src++;
    n--;
  }

  for (i = 0; i < n; i += 8) {
    sum1 += src[i] * src[i];
    sum2 += src[i + 1] * src[i + 1];
    sum3 += src[i + 2] * src[i + 2];
    sum4 += src[i + 3] * src[i + 3];
    sum5 += src[i + 4] * src[i + 4];
    sum6 += src[i + 5] * src[i + 5];
    sum7 += src[i + 6] * src[i + 6];
    sum8 += src[i + 7] * src[i + 7];
  }

  *dest = sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8;
}
1.148 +OIL_DEFINE_IMPL (squaresum_f64_i10_unroll8, squaresum_f64);
1.149 +
1.150 +
1.151 +
#ifdef __SYMBIAN32__

/*
 * Accessor compiled only when __SYMBIAN32__ is defined: returns the address
 * of the _oil_function_impl_squaresum_f64_i10_simple record (presumably the
 * object emitted by the OIL_DEFINE_IMPL line for this implementation --
 * confirm against liboilfunction.h).
 *
 * Declared with (void) so the definition is a real prototype rather than an
 * old-style empty parameter list.
 */
OilFunctionImpl* __oil_function_impl_squaresum_f64_i10_simple(void) {
  return &_oil_function_impl_squaresum_f64_i10_simple;
}
#endif
1.158 +
#ifdef __SYMBIAN32__

/*
 * Accessor compiled only when __SYMBIAN32__ is defined: returns the address
 * of the _oil_function_impl_squaresum_f64_i10_unroll4a record (presumably the
 * object emitted by the OIL_DEFINE_IMPL line for this implementation --
 * confirm against liboilfunction.h).
 *
 * Declared with (void) so the definition is a real prototype rather than an
 * old-style empty parameter list.
 */
OilFunctionImpl* __oil_function_impl_squaresum_f64_i10_unroll4a(void) {
  return &_oil_function_impl_squaresum_f64_i10_unroll4a;
}
#endif
1.165 +
#ifdef __SYMBIAN32__

/*
 * Accessor compiled only when __SYMBIAN32__ is defined: returns the address
 * of the _oil_function_impl_squaresum_f64_i10_unroll4 record (presumably the
 * object emitted by the OIL_DEFINE_IMPL line for this implementation --
 * confirm against liboilfunction.h).
 *
 * Declared with (void) so the definition is a real prototype rather than an
 * old-style empty parameter list.
 */
OilFunctionImpl* __oil_function_impl_squaresum_f64_i10_unroll4(void) {
  return &_oil_function_impl_squaresum_f64_i10_unroll4;
}
#endif
1.172 +
#ifdef __SYMBIAN32__

/*
 * Accessor compiled only when __SYMBIAN32__ is defined: returns the address
 * of the _oil_function_impl_squaresum_f64_i10_unroll8 record (presumably the
 * object emitted by the OIL_DEFINE_IMPL line for this implementation --
 * confirm against liboilfunction.h).
 *
 * Declared with (void) so the definition is a real prototype rather than an
 * old-style empty parameter list.
 */
OilFunctionImpl* __oil_function_impl_squaresum_f64_i10_unroll8(void) {
  return &_oil_function_impl_squaresum_f64_i10_unroll8;
}
#endif
1.179 +