/*
 * Copyright (c) Eric Anholt. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
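
/* Note: force_align_arg_pointer is a GCC attribute that realigns the stack
 * pointer on function entry. On 32-bit x86 ABIs the stack is only guaranteed
 * 4-byte alignment at a call boundary, so without it, spills of __m128
 * locals could fault in alignment-requiring SSE instructions. Other
 * compilers would need an equivalent mechanism. */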
SSE_FUNCTION static void
add_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  /* Scalar tail for the remaining (n < 4) elements */
  for (; n > 0; n--) {
    *dest++ = *src1++ + *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
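
/* The loops above and below all share one structure: a scalar prologue runs
 * until dest reaches a 16-byte boundary, the vector loop then uses unaligned
 * loads (_mm_loadu_ps/_mm_loadu_pd) for the sources but an aligned store
 * (_mm_store_ps/_mm_store_pd, which requires 16-byte alignment) for the
 * destination, and a scalar epilogue handles the leftovers. Only dest is
 * aligned: the three pointers may sit at different offsets, and misaligned
 * stores typically carry the larger penalty. */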
SSE_FUNCTION static void
add_f64_sse2 (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2; src1 += 2; src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
SSE_FUNCTION static void
add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  /* Unrolled: two independent 2-wide adds per iteration */
  while (3 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);

    xmm0 = _mm_loadu_pd(src1+2);
    xmm1 = _mm_loadu_pd(src2+2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest+2, xmm0);
    dest += 4; src1 += 4; src2 += 4;
    n -= 4;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2; src1 += 2; src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
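
/* Both add_f64_sse2 and add_f64_sse2_unroll are registered for the same
 * add_f64 class; liboil benchmarks the registered implementations at
 * oil_init() time and dispatches to the fastest one the running CPU
 * supports, so the unrolled variant is only chosen where the extra
 * instruction-level parallelism actually pays off. */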
SSE_FUNCTION static void
subtract_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ - *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ - *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
multiply_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
divide_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
minimum_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++; src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++; src2++;
  }
}
OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
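
/* MINPS (above) and MAXPS (below) return their second operand when the
 * comparison is unordered (a NaN is present) or the operands compare equal.
 * Since the operand order here is (src1, src2), the vector result matches
 * the scalar `*src1 < *src2 ? *src1 : *src2` reference exactly, NaN inputs
 * included. */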
SSE_FUNCTION static void
maximum_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++; src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4; src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++; src2++;
  }
}
OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
inverse_f32_sse (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so do a full
     * divide instead. */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
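
/* For reference: RCPPS computes only an approximate reciprocal (Intel
 * documents a maximum relative error of about 1.5 * 2^-12), which is why its
 * results diverge from the 1.0 / src1 reference noted above. The full divide
 * is slower but correctly rounded. */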
SSE_FUNCTION static void
negative_f32_sse (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = -(*src1++);
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    /* Negate by subtracting from zero. (Note: 0.0 - (+0.0) yields +0.0, so
     * this differs from scalar negation only in the sign of zero.) */
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = -(*src1++);
  }
}
OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
{
  __m128 xmm0, xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
  /* Broadcast the scalar into all four lanes once, outside the loop */
  xmm1 = _mm_load_ps1(val);
  for (; n >= 4; n -= 4) {
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
{
  __m128 xmm0, xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 4; n -= 4) {
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4; src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm0, xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 2; n -= 2) {
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2; src1 += 2;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
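
/* Example usage (a minimal sketch, not part of this file): callers go
 * through liboil's generic entry points rather than calling these impls
 * directly, e.g.
 *
 *   #include <liboil/liboil.h>
 *
 *   float a[256], b[256], sum[256];
 *   // ... fill a and b ...
 *   oil_init ();                    // one-time: benchmark and select impls
 *   oil_add_f32 (sum, a, b, 256);   // may dispatch to add_f32_sse above
 *
 * The public oil_add_f32 takes (dest, src1, src2, n), mirroring the impl
 * signatures in this file. */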
#ifdef __SYMBIAN32__
/* Symbian-specific getters returning pointers to the implementation
 * descriptors registered by OIL_DEFINE_IMPL_FULL above, so they can be
 * retrieved across the DLL boundary. */
OilFunctionImpl* __oil_function_impl_add_f32_sse() {
  return &_oil_function_impl_add_f32_sse;
}
OilFunctionImpl* __oil_function_impl_add_f64_sse2() {
  return &_oil_function_impl_add_f64_sse2;
}
OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll() {
  return &_oil_function_impl_add_f64_sse2_unroll;
}
OilFunctionImpl* __oil_function_impl_subtract_f32_sse() {
  return &_oil_function_impl_subtract_f32_sse;
}
OilFunctionImpl* __oil_function_impl_multiply_f32_sse() {
  return &_oil_function_impl_multiply_f32_sse;
}
OilFunctionImpl* __oil_function_impl_divide_f32_sse() {
  return &_oil_function_impl_divide_f32_sse;
}
OilFunctionImpl* __oil_function_impl_minimum_f32_sse() {
  return &_oil_function_impl_minimum_f32_sse;
}
OilFunctionImpl* __oil_function_impl_maximum_f32_sse() {
  return &_oil_function_impl_maximum_f32_sse;
}
OilFunctionImpl* __oil_function_impl_inverse_f32_sse() {
  return &_oil_function_impl_inverse_f32_sse;
}
OilFunctionImpl* __oil_function_impl_negative_f32_sse() {
  return &_oil_function_impl_negative_f32_sse;
}
OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse() {
  return &_oil_function_impl_scalaradd_f32_ns_sse;
}
OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse() {
  return &_oil_function_impl_scalarmultiply_f32_ns_sse;
}
OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2() {
  return &_oil_function_impl_scalarmultiply_f64_ns_sse2;
}
#endif