/*
 * Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>

#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
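
/* SSE_FUNCTION uses GCC's force_align_arg_pointer attribute to re-align the
 * stack to 16 bytes on entry to each implementation.  This is presumably
 * needed because 32-bit callers may only guarantee 4-byte stack alignment,
 * while any __m128 value spilled to the stack must be 16-byte aligned.
 */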
SSE_FUNCTION static void
add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
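
/* Illustration only (not part of the original file): callers never invoke
 * add_f32_sse_unroll2() directly.  They call the public liboil entry point,
 * and liboil dispatches to the fastest registered implementation whose flags
 * (here OIL_IMPL_FLAG_SSE) the running CPU supports, roughly:
 *
 *   oil_init ();
 *   oil_add_f32 (dest, src1, src2, n);
 */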
SSE_FUNCTION static void
subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ - *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ - *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
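
/* Note on NaN handling: _mm_min_ps and _mm_max_ps return their second
 * operand when the comparison is unordered, which matches the scalar
 * fallbacks above (the ?: expressions also yield *src2 when either input
 * is NaN), so the SIMD and scalar paths agree.
 */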
SSE_FUNCTION static void
inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so do that.
     */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
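
/* Background for the comment above: _mm_rcp_ps only computes an approximate
 * reciprocal (roughly 12 bits of precision), so its results differ from the
 * exact 1.0 / x reference implementation; hence the full divide here.
 */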
SSE_FUNCTION static void
negative_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = -(*src1++);
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = -(*src1++);
  }
}
OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
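
/* The stubs below come from the Symbian port (see the Nokia copyright notice
 * above); they presumably exist so the _oil_function_impl_* symbols can be
 * referenced from a Symbian DLL export table.
 */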
#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2, add_f32() {
  return &_oil_function_impl_add_f32_sse_unroll2, add_f32;
}

OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2, subtract_f32() {
  return &_oil_function_impl_subtract_f32_sse_unroll2, subtract_f32;
}

OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2, multiply_f32() {
  return &_oil_function_impl_multiply_f32_sse_unroll2, multiply_f32;
}

OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2, divide_f32() {
  return &_oil_function_impl_divide_f32_sse_unroll2, divide_f32;
}

OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2, minimum_f32() {
  return &_oil_function_impl_minimum_f32_sse_unroll2, minimum_f32;
}

OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2, maximum_f32() {
  return &_oil_function_impl_maximum_f32_sse_unroll2, maximum_f32;
}

OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2, inverse_f32() {
  return &_oil_function_impl_inverse_f32_sse_unroll2, inverse_f32;
}

OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2, negative_f32() {
  return &_oil_function_impl_negative_f32_sse_unroll2, negative_f32;
}

OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns() {
  return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns;
}

OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns() {
  return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns;
}

OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns() {
  return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns;
}

#endif