Update contrib.
3 * Eric Anholt. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 //Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
31 #include "liboil/liboilclasses.h"
32 #include "liboil/liboilfunction.h"
33 #include <emmintrin.h>
34 #include <xmmintrin.h>
36 /* TODO: Only define SSE_FUNCTION this way when building with gcc 4.2 or above. Otherwise, disable all SSE use */
37 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
/*
 * clamp_u8_sse: vector clamp of n unsigned bytes read from src1 and written
 * to dest, with the lower bound taken from *src2_1 and the upper bound from
 * *src3_1.
 * NOTE(review): this listing is missing lines (end of the parameter list,
 * braces, the scalar loop body, and whatever handles the final n < 16
 * elements), so only the statements shown are described.
 */
39 SSE_FUNCTION static void
40 clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1,
/* Both bounds are read once, before any loop. */
44 uint8_t min = *src2_1;
45 uint8_t max = *src3_1;
47 /* Initial operations to align the destination pointer */
/* Runs while dest is not on a 16-byte boundary and elements remain; only
 * the low four address bits are tested. */
48 for (; ((long)dest & 15) && (n > 0); n--) {
/* Broadcast each bound into all 16 byte lanes. */
56 xmm1 = _mm_set1_epi8(min);
57 xmm2 = _mm_set1_epi8(max);
/* Main loop: one 128-bit vector (16 bytes) per iteration. */
58 for (; n >= 16; n -= 16) {
/* Unaligned load: nothing above aligns src1. */
60 xmm0 = _mm_loadu_si128((__m128i *)src1);
/* Raise to min first, then cap at max (unsigned byte compares), so max
 * wins if the bounds are inverted. */
61 xmm0 = _mm_max_epu8(xmm0, xmm1);
62 xmm0 = _mm_min_epu8(xmm0, xmm2);
/* Aligned store; depends on the alignment loop above. */
63 _mm_store_si128((__m128i *)dest, xmm0);
/* Hands clamp_u8_sse, clamp_u8 and the SSE2 flag to OIL_DEFINE_IMPL_FULL
 * (macro defined outside this listing). */
76 OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2);
/*
 * clamp_s16_sse: vector clamp of n signed 16-bit values read from src1 and
 * written to dest, with the lower bound taken from *src2_1 and the upper
 * bound from *src3_1.
 * NOTE(review): lines are missing from this listing (end of the parameter
 * list, braces, the scalar loop body, handling of the final n < 8 elements);
 * comments cover only what is shown.
 */
78 SSE_FUNCTION static void
79 clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1,
/* Bounds are fetched once, ahead of the loops. */
83 int16_t min = *src2_1;
84 int16_t max = *src3_1;
86 /* Initial operations to align the destination pointer */
/* Continues until dest sits on a 16-byte boundary or n reaches zero. */
87 for (; ((long)dest & 15) && (n > 0); n--) {
/* Replicate each bound across all eight 16-bit lanes. */
95 xmm1 = _mm_set1_epi16(min);
96 xmm2 = _mm_set1_epi16(max);
/* Main loop: eight elements (one 128-bit vector) per iteration. */
97 for (; n >= 8; n -= 8) {
/* src1 is not aligned by anything above, so use the unaligned load. */
99 xmm0 = _mm_loadu_si128((__m128i *)src1);
/* Signed 16-bit max against the lower bound, then min against the upper. */
100 xmm0 = _mm_max_epi16(xmm0, xmm1);
101 xmm0 = _mm_min_epi16(xmm0, xmm2);
/* Aligned store into dest. */
102 _mm_store_si128((__m128i *)dest, xmm0);
/* Passes clamp_s16_sse, clamp_s16 and the SSE2 flag to OIL_DEFINE_IMPL_FULL
 * (defined outside this listing). */
115 OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2);
/*
 * clamp_f32_sse: vector clamp of n floats read from src1 and written to
 * dest, using the single-precision max/min intrinsics.
 * NOTE(review): the declarations of min and max are not in this listing;
 * presumably they are loaded from *src2_1 and a third bound pointer as in
 * the integer variants - confirm against the full file. The scalar loop
 * body and the handling of the final n < 4 elements are also not shown.
 */
117 SSE_FUNCTION static void
118 clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
125 /* Initial operations to align the destination pointer */
/* Loops while dest is off a 16-byte boundary and elements remain. */
126 for (; ((long)dest & 15) && (n > 0); n--) {
/* Copy each bound into all four float lanes. */
134 xmm1 = _mm_set_ps1(min);
135 xmm2 = _mm_set_ps1(max);
/* Main loop: four floats per iteration. */
136 for (; n >= 4; n -= 4) {
/* Unaligned load from src1, whose alignment is not established above. */
138 xmm0 = _mm_loadu_ps(src1);
/* Lower bound applied before the upper bound. */
139 xmm0 = _mm_max_ps(xmm0, xmm1);
140 xmm0 = _mm_min_ps(xmm0, xmm2);
/* Aligned store; dest alignment comes from the loop above. */
141 _mm_store_ps(dest, xmm0);
/* Unlike the integer variants this passes OIL_IMPL_FLAG_SSE, not SSE2. */
154 OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE);
/*
 * clamp_f64_sse: vector clamp of n doubles read from src1 and written to
 * dest, with the lower bound taken from *src2_1 and the upper bound from
 * *src3_1.
 * NOTE(review): braces, the scalar loop body and the handling of a final
 * odd element are not part of this listing.
 */
156 SSE_FUNCTION static void
157 clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
158 const double *src3_1)
/* Read both bounds once. */
161 double min = *src2_1;
162 double max = *src3_1;
164 /* Initial operations to align the destination pointer */
/* Runs while dest is not 16-byte aligned and n is still positive. */
165 for (; ((long)dest & 15) && (n > 0); n--) {
/* Duplicate each bound into both double lanes. */
173 xmm1 = _mm_set1_pd(min);
174 xmm2 = _mm_set1_pd(max);
/* Main loop: two doubles per iteration. */
175 for (; n >= 2; n -= 2) {
/* Unaligned load of the source pair. */
177 xmm0 = _mm_loadu_pd(src1);
/* Apply the lower bound, then the upper bound. */
178 xmm0 = _mm_max_pd(xmm0, xmm1);
179 xmm0 = _mm_min_pd(xmm0, xmm2);
/* Aligned store to dest. */
180 _mm_store_pd(dest, xmm0);
/* Passes both the SSE and SSE2 flags to OIL_DEFINE_IMPL_FULL. */
193 OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64,
194 OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
/*
 * clamplow_u8_sse: raises each of n unsigned bytes from src1 to at least
 * *src2_1 and writes the result to dest; no upper bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of the final
 * n < 16 elements are missing from this listing.
 */
196 SSE_FUNCTION static void
197 clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
198 const uint8_t *src2_1)
/* The lower bound is read once. */
201 uint8_t min = *src2_1;
203 /* Initial operations to align the destination pointer */
/* Stops once dest reaches a 16-byte boundary or n hits zero. */
204 for (; ((long)dest & 15) && (n > 0); n--) {
/* Broadcast the bound to all 16 byte lanes. */
210 xmm1 = _mm_set1_epi8(min);
/* Main loop: 16 bytes per iteration. */
211 for (; n >= 16; n -= 16) {
/* Unaligned load, unsigned byte max, aligned store. */
213 xmm0 = _mm_loadu_si128((__m128i *)src1);
214 xmm0 = _mm_max_epu8(xmm0, xmm1);
215 _mm_store_si128((__m128i *)dest, xmm0);
/* Passes clamplow_u8_sse, clamplow_u8 and the SSE2 flag to
 * OIL_DEFINE_IMPL_FULL (defined outside this listing). */
226 OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2);
/*
 * clamplow_s16_sse: raises each of n signed 16-bit values from src1 to at
 * least *src2_1 and writes the result to dest; no upper bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of the final
 * n < 8 elements are missing from this listing.
 */
228 SSE_FUNCTION static void
229 clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
230 const int16_t *src2_1)
/* Lower bound, read once. */
233 int16_t min = *src2_1;
235 /* Initial operations to align the destination pointer */
/* Loops until dest is 16-byte aligned or no elements are left. */
236 for (; ((long)dest & 15) && (n > 0); n--) {
/* Replicate the bound across all eight 16-bit lanes. */
242 xmm1 = _mm_set1_epi16(min);
/* Main loop: eight elements per iteration. */
243 for (; n >= 8; n -= 8) {
/* Unaligned load, signed 16-bit max, aligned store. */
245 xmm0 = _mm_loadu_si128((__m128i *)src1);
246 xmm0 = _mm_max_epi16(xmm0, xmm1);
247 _mm_store_si128((__m128i *)dest, xmm0);
/* Passes clamplow_s16_sse, clamplow_s16 and the SSE2 flag to
 * OIL_DEFINE_IMPL_FULL (defined outside this listing). */
258 OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2);
/*
 * clamplow_f32_sse: applies a lower bound to n floats read from src1 and
 * writes the result to dest.
 * NOTE(review): the declaration of min is not in this listing; presumably
 * it is loaded from *src2_1 - confirm against the full file. The scalar
 * loop body and the handling of the final n < 4 elements are also missing.
 */
260 SSE_FUNCTION static void
261 clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
266 /* Initial operations to align the destination pointer */
/* Runs while dest is off a 16-byte boundary and n is positive. */
267 for (; ((long)dest & 15) && (n > 0); n--) {
/* Copy the bound into all four float lanes. */
273 xmm1 = _mm_set_ps1(min);
/* Main loop: four floats per iteration. */
274 for (; n >= 4; n -= 4) {
/* Unaligned load, single-precision max, aligned store. */
276 xmm0 = _mm_loadu_ps(src1);
277 xmm0 = _mm_max_ps(xmm0, xmm1);
278 _mm_store_ps(dest, xmm0);
/* Passes OIL_IMPL_FLAG_SSE (not SSE2) to OIL_DEFINE_IMPL_FULL. */
289 OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE);
/*
 * clamplow_f64_sse: raises each of n doubles from src1 to at least *src2_1
 * and writes the result to dest; no upper bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of a final
 * odd element are missing from this listing.
 */
291 SSE_FUNCTION static void
292 clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
/* Lower bound, read once. */
295 double min = *src2_1;
297 /* Initial operations to align the destination pointer */
/* Stops when dest is 16-byte aligned or n reaches zero. */
298 for (; ((long)dest & 15) && (n > 0); n--) {
/* Duplicate the bound into both double lanes. */
304 xmm1 = _mm_set1_pd(min);
/* Main loop: two doubles per iteration. */
305 for (; n >= 2; n -= 2) {
/* Unaligned load, double-precision max, aligned store. */
307 xmm0 = _mm_loadu_pd(src1);
308 xmm0 = _mm_max_pd(xmm0, xmm1);
309 _mm_store_pd(dest, xmm0);
/* Passes both the SSE and SSE2 flags to OIL_DEFINE_IMPL_FULL. */
320 OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64,
321 OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
/*
 * clamphigh_u8_sse: caps each of n unsigned bytes from src1 at *src2_1 and
 * writes the result to dest; no lower bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of the final
 * n < 16 elements are missing from this listing.
 */
323 SSE_FUNCTION static void
324 clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
325 const uint8_t *src2_1)
/* Here src2_1 carries the upper bound. */
328 uint8_t max = *src2_1;
330 /* Initial operations to align the destination pointer */
/* Loops until dest is 16-byte aligned or no elements are left. */
331 for (; ((long)dest & 15) && (n > 0); n--) {
/* Broadcast the bound to all 16 byte lanes. */
337 xmm1 = _mm_set1_epi8(max);
/* Main loop: 16 bytes per iteration. */
338 for (; n >= 16; n -= 16) {
/* Unaligned load, unsigned byte min, aligned store. */
340 xmm0 = _mm_loadu_si128((__m128i *)src1);
341 xmm0 = _mm_min_epu8(xmm0, xmm1);
342 _mm_store_si128((__m128i *)dest, xmm0);
/* Passes clamphigh_u8_sse, clamphigh_u8 and the SSE2 flag to
 * OIL_DEFINE_IMPL_FULL (defined outside this listing). */
353 OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2);
/*
 * clamphigh_s16_sse: caps each of n signed 16-bit values from src1 at
 * *src2_1 and writes the result to dest; no lower bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of the final
 * n < 8 elements are missing from this listing.
 */
355 SSE_FUNCTION static void
356 clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
357 const int16_t *src2_1)
/* Here src2_1 carries the upper bound. */
360 int16_t max = *src2_1;
362 /* Initial operations to align the destination pointer */
/* Runs while dest is off a 16-byte boundary and n is positive. */
363 for (; ((long)dest & 15) && (n > 0); n--) {
/* Replicate the bound across all eight 16-bit lanes. */
369 xmm1 = _mm_set1_epi16(max);
/* Main loop: eight elements per iteration. */
370 for (; n >= 8; n -= 8) {
/* Unaligned load, signed 16-bit min, aligned store. */
372 xmm0 = _mm_loadu_si128((__m128i *)src1);
373 xmm0 = _mm_min_epi16(xmm0, xmm1);
374 _mm_store_si128((__m128i *)dest, xmm0);
/* Passes clamphigh_s16_sse, clamphigh_s16 and the SSE2 flag to
 * OIL_DEFINE_IMPL_FULL (defined outside this listing). */
385 OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2);
/*
 * clamphigh_f32_sse: applies an upper bound to n floats read from src1 and
 * writes the result to dest.
 * NOTE(review): the declaration of max is not in this listing; presumably
 * it is loaded from *src2_1 - confirm against the full file. The scalar
 * loop body and the handling of the final n < 4 elements are also missing.
 */
387 SSE_FUNCTION static void
388 clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
393 /* Initial operations to align the destination pointer */
/* Stops when dest is 16-byte aligned or n reaches zero. */
394 for (; ((long)dest & 15) && (n > 0); n--) {
/* Copy the bound into all four float lanes. */
400 xmm1 = _mm_set_ps1(max);
/* Main loop: four floats per iteration. */
401 for (; n >= 4; n -= 4) {
/* Unaligned load, single-precision min, aligned store. */
403 xmm0 = _mm_loadu_ps(src1);
404 xmm0 = _mm_min_ps(xmm0, xmm1);
405 _mm_store_ps(dest, xmm0);
/* Passes OIL_IMPL_FLAG_SSE (not SSE2) to OIL_DEFINE_IMPL_FULL. */
416 OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE);
/*
 * clamphigh_f64_sse: caps each of n doubles from src1 at *src2_1 and writes
 * the result to dest; no lower bound is applied.
 * NOTE(review): braces, the scalar loop body and the handling of a final
 * odd element are missing from this listing.
 */
418 SSE_FUNCTION static void
419 clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
/* Here src2_1 carries the upper bound. */
422 double max = *src2_1;
424 /* Initial operations to align the destination pointer */
/* Loops until dest is 16-byte aligned or no elements are left. */
425 for (; ((long)dest & 15) && (n > 0); n--) {
/* Duplicate the bound into both double lanes. */
431 xmm1 = _mm_set1_pd(max);
/* Main loop: two doubles per iteration. */
432 for (; n >= 2; n -= 2) {
/* Unaligned load, double-precision min, aligned store. */
434 xmm0 = _mm_loadu_pd(src1);
435 xmm0 = _mm_min_pd(xmm0, xmm1);
436 _mm_store_pd(dest, xmm0);
/* Passes both the SSE and SSE2 flags to OIL_DEFINE_IMPL_FULL. */
447 OIL_DEFINE_IMPL_FULL (clamphigh_f64_sse, clamphigh_f64,
448 OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
/*
 * Accessor stubs, one per implementation named in the OIL_DEFINE_IMPL_FULL
 * lines of this file; each is declared to return an OilFunctionImpl pointer.
 * NOTE(review): every name below contains ", <class>", which looks like both
 * OIL_DEFINE_IMPL_FULL arguments pasted into a template meant for one.
 * As written, each return statement is a comma expression whose value is the
 * second operand, not the address of the _oil_function_impl_* object, and
 * the declarator before the comma is not a function name.
 * Presumably the intent is __oil_function_impl_<impl>() returning
 * &_oil_function_impl_<impl> - confirm against the macro definition, which
 * is outside this listing. The closing braces are not shown here either.
 */
453 OilFunctionImpl* __oil_function_impl_clamp_u8_sse, clamp_u8() {
454 return &_oil_function_impl_clamp_u8_sse, clamp_u8;
460 OilFunctionImpl* __oil_function_impl_clamp_s16_sse, clamp_s16() {
461 return &_oil_function_impl_clamp_s16_sse, clamp_s16;
467 OilFunctionImpl* __oil_function_impl_clamp_f32_sse, clamp_f32() {
468 return &_oil_function_impl_clamp_f32_sse, clamp_f32;
474 OilFunctionImpl* __oil_function_impl_clamp_f64_sse, clamp_f64() {
475 return &_oil_function_impl_clamp_f64_sse, clamp_f64;
481 OilFunctionImpl* __oil_function_impl_clamplow_u8_sse, clamplow_u8() {
482 return &_oil_function_impl_clamplow_u8_sse, clamplow_u8;
488 OilFunctionImpl* __oil_function_impl_clamplow_s16_sse, clamplow_s16() {
489 return &_oil_function_impl_clamplow_s16_sse, clamplow_s16;
495 OilFunctionImpl* __oil_function_impl_clamplow_f32_sse, clamplow_f32() {
496 return &_oil_function_impl_clamplow_f32_sse, clamplow_f32;
502 OilFunctionImpl* __oil_function_impl_clamplow_f64_sse, clamplow_f64() {
503 return &_oil_function_impl_clamplow_f64_sse, clamplow_f64;
509 OilFunctionImpl* __oil_function_impl_clamphigh_u8_sse, clamphigh_u8() {
510 return &_oil_function_impl_clamphigh_u8_sse, clamphigh_u8;
516 OilFunctionImpl* __oil_function_impl_clamphigh_s16_sse, clamphigh_s16() {
517 return &_oil_function_impl_clamphigh_s16_sse, clamphigh_s16;
523 OilFunctionImpl* __oil_function_impl_clamphigh_f32_sse, clamphigh_f32() {
524 return &_oil_function_impl_clamphigh_f32_sse, clamphigh_f32;
530 OilFunctionImpl* __oil_function_impl_clamphigh_f64_sse, clamphigh_f64() {
531 return &_oil_function_impl_clamphigh_f64_sse, clamphigh_f64;