Update contrib.
2 * Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
4 * This component and the accompanying materials are made available
5 * under the terms of "Eclipse Public License v1.0"
6 * which accompanies this distribution, and is available
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
9 * Initial Contributors:
10 * Nokia Corporation - initial contribution.
21 #include <liboil/liboilclasses.h>
22 #include <liboil/liboilfunction.h>
23 #include <emmintrin.h>
/*
 * SSE_FUNCTION: attribute prefix applied to the SSE2 routine(s) defined in
 * this file.  It expands to GCC's force_align_arg_pointer function attribute,
 * which asks the compiler to emit a prologue/epilogue that realigns the
 * run-time stack when necessary, so the function can be called from code
 * that does not keep the stack aligned for SSE.
 */
25 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
/*
 * MULTSUM_SSE2_NSTRIDED(i): multiply one pair of adjacent doubles from src1
 * with one pair from src2 and add the two products to the lanes of sum.reg.
 *
 * i is passed to OIL_GET as the offset for both inputs.  The macro works on
 * the caller's locals: src1, src2, the temporaries t1 and t2 (which must be
 * able to hold a __m128d) and the accumulator sum.reg.
 *
 * Both loads use _mm_load_pd, which requires a 16-byte-aligned address.
 *
 * The body is wrapped in do { } while (0) so that "MACRO(x);" is exactly one
 * statement and remains valid inside an unbraced if/else; the parameter is
 * parenthesised so expression arguments expand as written.
 */
#define MULTSUM_SSE2_NSTRIDED(i) do { \
    t1 = _mm_load_pd(&OIL_GET(src1, (i), double)); \
    t2 = _mm_load_pd(&OIL_GET(src2, (i), double)); \
    t1 = _mm_mul_pd(t1, t2); \
    sum.reg = _mm_add_pd(sum.reg, t1); \
  } while (0)
/*
 * MULTSUM_SSE2_NSTRIDEDP(i): like MULTSUM_SSE2_STRIDED, but the src1 pair is
 * fetched with a single 128-bit load while the src2 pair is gathered from
 * two separately addressed doubles.
 *
 *   src1 pair: one _mm_load_pd at offset (i)*sstr1.  _mm_load_pd requires a
 *              16-byte-aligned address and reads two adjacent doubles.
 *   src2 pair: low lane from offset (i)*sstr2, high lane from ((i)+1)*sstr2.
 *
 * The two products are added to the lanes of sum.reg.  Uses the caller's
 * locals src1, src2, sstr1, sstr2, t1, t2 and sum.reg.
 *
 * Changes from the brace-block form:
 *  - do { } while (0) wrapper, so "MACRO(x);" is one statement and is safe in
 *    an unbraced if/else;
 *  - the parameter is parenthesised, so MACRO(a+b) scales (a+b), not just b;
 *  - the low lane of t2 is produced by _mm_load_sd instead of
 *    _mm_loadl_pd(t2, ...), so the macro no longer takes the previous
 *    contents of t2 as an input.  The resulting vector is the same because
 *    the high lane is overwritten by _mm_loadh_pd immediately afterwards.
 */
#define MULTSUM_SSE2_NSTRIDEDP(i) do { \
    t1 = _mm_load_pd(&OIL_GET(src1, (i)*sstr1, double)); \
    t2 = _mm_load_sd(&OIL_GET(src2, (i)*sstr2, double)); \
    t2 = _mm_loadh_pd(t2, &OIL_GET(src2, ((i)+1)*sstr2, double)); \
    t1 = _mm_mul_pd(t1, t2); \
    sum.reg = _mm_add_pd(sum.reg, t1); \
  } while (0)
/*
 * MULTSUM_SSE2_STRIDED(i): multiply-accumulate two elements from each input
 * when neither input can be read with a single 128-bit load.
 *
 * For each of src1 and src2 the low lane is loaded from offset (i)*sstrN and
 * the high lane from offset ((i)+1)*sstrN; the two vectors are multiplied
 * lane-wise and the products are added to sum.reg.  Uses the caller's locals
 * src1, src2, sstr1, sstr2, t1, t2 and sum.reg.
 *
 * Changes from the brace-block form:
 *  - do { } while (0) wrapper, so "MACRO(x);" is one statement and is safe in
 *    an unbraced if/else;
 *  - the parameter is parenthesised, so MACRO(a+b) scales (a+b), not just b;
 *  - the low lanes are produced by _mm_load_sd instead of
 *    _mm_loadl_pd(tN, ...), so the macro no longer takes the previous
 *    contents of t1/t2 as inputs (which would be an indeterminate value on
 *    first use if the caller has not initialised them).  The resulting
 *    vectors are the same because the high lanes are overwritten by
 *    _mm_loadh_pd immediately afterwards.
 */
#define MULTSUM_SSE2_STRIDED(i) do { \
    t1 = _mm_load_sd(&OIL_GET(src1, (i)*sstr1, double)); \
    t1 = _mm_loadh_pd(t1, &OIL_GET(src1, ((i)+1)*sstr1, double)); \
    t2 = _mm_load_sd(&OIL_GET(src2, (i)*sstr2, double)); \
    t2 = _mm_loadh_pd(t2, &OIL_GET(src2, ((i)+1)*sstr2, double)); \
    t1 = _mm_mul_pd(t1, t2); \
    sum.reg = _mm_add_pd(sum.reg, t1); \
  } while (0)
50 #ifdef ENABLE_BROKEN_IMPLS
/*
 * multsum_f64_sse2_unroll4: SSE2 multiply-and-sum kernel.  It accumulates
 * products of doubles taken from src1 and src2 in sum.reg and stores the
 * scalar total in *dest.  The OIL_DEFINE_IMPL_FULL line at the end registers
 * it for multsum_f64 with the OIL_IMPL_FLAG_SSE2 flag.
 *
 *   dest          receives the result
 *   src1, sstr1   first input and its stride; sstr1 scales the offsets given
 *                 to OIL_GET and OIL_INCREMENT (presumably a byte stride --
 *                 TODO confirm against the liboil headers)
 *   src2, sstr2   second input and its stride, same convention
 *
 * The function follows an #ifdef ENABLE_BROKEN_IMPLS line, so it is only
 * compiled when that macro is defined.
 *
 * NOTE(review): the end of the parameter list, the local declarations and
 * the loop / branch headers are not visible in this excerpt; the comments
 * below describe only the statements that are shown.
 */
51 SSE_FUNCTION static void
52 multsum_f64_sse2_unroll4(double *dest,
53 const double *src1, int sstr1,
54 const double *src2, int sstr2,
/* Clear the accumulator before any products are added. */
64 sum.reg = _mm_setzero_pd();
/*
 * Four elements per pass: element pairs (0,1) and (2,3) are multiplied and
 * accumulated, then both inputs are advanced by four strides.
 */
66 MULTSUM_SSE2_STRIDED(0);
67 MULTSUM_SSE2_STRIDED(2);
69 OIL_INCREMENT(src1, 4*sstr1);
70 OIL_INCREMENT(src2, 4*sstr2);
/* Two elements per pass: one pair, then advance both inputs by two strides. */
74 MULTSUM_SSE2_STRIDED(0);
76 OIL_INCREMENT(src1, 2*sstr1);
77 OIL_INCREMENT(src2, 2*sstr2);
/*
 * Reduce to a scalar: add the two values read through sum.vals (presumably
 * the two lanes of sum.reg viewed through a union -- declaration not shown).
 */
80 *dest = sum.vals[0] + sum.vals[1];
/*
 * Add the product of the single element at the current position of each
 * input.  NOTE(review): the condition guarding this statement is not
 * visible here; it looks like the odd-length tail -- confirm.
 */
82 *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double));
/* Register this function as an implementation of multsum_f64, flagged SSE2. */
85 OIL_DEFINE_IMPL_FULL (multsum_f64_sse2_unroll4, multsum_f64, OIL_IMPL_FLAG_SSE2);
92 OilFunctionImpl* __oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64() {
93 return &_oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64;