sl@0: /* sl@0: * Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies). sl@0: * All rights reserved. sl@0: * This component and the accompanying materials are made available sl@0: * under the terms of "Eclipse Public License v1.0" sl@0: * which accompanies this distribution, and is available sl@0: * at the URL "http://www.eclipse.org/legal/epl-v10.html". sl@0: * sl@0: * Initial Contributors: sl@0: * Nokia Corporation - initial contribution. sl@0: * sl@0: * Contributors: sl@0: * sl@0: * Description: sl@0: * sl@0: */ sl@0: sl@0: #ifdef HAVE_CONFIG_H sl@0: #include "config.h" sl@0: #endif sl@0: #include sl@0: #include sl@0: #include sl@0: sl@0: #define SSE_FUNCTION __attribute__((force_align_arg_pointer)) sl@0: sl@0: #define MULTSUM_SSE2_NSTRIDED(i) { \ sl@0: t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \ sl@0: t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \ sl@0: t1 = _mm_mul_pd(t1,t2); \ sl@0: sum.reg = _mm_add_pd(sum.reg,t1); \ sl@0: } sl@0: #define MULTSUM_SSE2_NSTRIDEDP(i) { \ sl@0: t1 = _mm_load_pd(&OIL_GET(src1, i*sstr1, double)); \ sl@0: t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ sl@0: t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ sl@0: t1 = _mm_mul_pd(t1,t2); \ sl@0: sum.reg = _mm_add_pd(sum.reg,t1); \ sl@0: } sl@0: #define MULTSUM_SSE2_STRIDED(i) { \ sl@0: t1 = _mm_loadl_pd(t1, &OIL_GET(src1, i*sstr1, double)); \ sl@0: t1 = _mm_loadh_pd(t1, &OIL_GET(src1, (i+1)*sstr1, double)); \ sl@0: t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ sl@0: t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ sl@0: t1 = _mm_mul_pd(t1,t2); \ sl@0: sum.reg = _mm_add_pd(sum.reg,t1); \ sl@0: } sl@0: sl@0: sl@0: #ifdef ENABLE_BROKEN_IMPLS sl@0: SSE_FUNCTION static void sl@0: multsum_f64_sse2_unroll4(double *dest, sl@0: const double *src1, int sstr1, sl@0: const double *src2, int sstr2, sl@0: int n) sl@0: { sl@0: __m128d t1, t2; sl@0: union { sl@0: __m128d reg; sl@0: double vals[2]; sl@0: } sum; sl@0: int i = 0; sl@0: sl@0: sum.reg = _mm_setzero_pd(); sl@0: while (i < n-3) { sl@0: MULTSUM_SSE2_STRIDED(0); sl@0: MULTSUM_SSE2_STRIDED(2); sl@0: sl@0: OIL_INCREMENT(src1, 4*sstr1); sl@0: OIL_INCREMENT(src2, 4*sstr2); sl@0: i += 4; sl@0: } sl@0: while (i < n-1) { sl@0: MULTSUM_SSE2_STRIDED(0); sl@0: sl@0: OIL_INCREMENT(src1, 2*sstr1); sl@0: OIL_INCREMENT(src2, 2*sstr2); sl@0: i+=2; sl@0: } sl@0: *dest = sum.vals[0] + sum.vals[1]; sl@0: if (i < n) { sl@0: *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double)); sl@0: } sl@0: } sl@0: OIL_DEFINE_IMPL_FULL (multsum_f64_sse2_unroll4, multsum_f64, OIL_IMPL_FLAG_SSE2); sl@0: #endif sl@0: sl@0: sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64() { sl@0: return &_oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64; sl@0: } sl@0: #endif sl@0: