1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/clamp_sse.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,534 @@
1.4 +/*
1.5 + * Copyright (c) 2005
1.6 + * Eric Anholt. All rights reserved.
1.7 + *
1.8 + * Redistribution and use in source and binary forms, with or without
1.9 + * modification, are permitted provided that the following conditions
1.10 + * are met:
1.11 + * 1. Redistributions of source code must retain the above copyright
1.12 + * notice, this list of conditions and the following disclaimer.
1.13 + * 2. Redistributions in binary form must reproduce the above copyright
1.14 + * notice, this list of conditions and the following disclaimer in the
1.15 + * documentation and/or other materials provided with the distribution.
1.16 + *
1.17 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
1.18 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1.19 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.20 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
1.21 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1.22 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1.23 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.24 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1.25 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1.26 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1.27 + * SUCH DAMAGE.
1.28 + */
1.29 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.30 +
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "liboil/liboilclasses.h"
#include "liboil/liboilfunction.h"
#include <stdint.h>
#include <emmintrin.h>
#include <xmmintrin.h>
1.38 +
1.39 +/* TODO: If we have gcc 4.2 or above, do this. Otherwise, disable all SSE use */
1.40 +#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
1.41 +
/*
 * clamp_u8_sse: clamp n unsigned bytes into the range [*src2_1, *src3_1].
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *   src3_1  pointer to the upper bound (read once, up front)
 *
 * Elements are processed one at a time until dest sits on a 16-byte
 * boundary, then 16 per iteration (unaligned load, aligned store), and the
 * remaining 0..15 elements one at a time again.
 */
SSE_FUNCTION static void
clamp_u8_sse (uint8_t *dest, const uint8_t *src1, int n, const uint8_t *src2_1,
    const uint8_t *src3_1)
{
  const uint8_t min = *src2_1;
  const uint8_t max = *src3_1;
  __m128i vmin, vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmin = _mm_set1_epi8((char)min);
  vmax = _mm_set1_epi8((char)max);
  for (; n >= 16; n -= 16) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_max_epu8(v, vmin);
    v = _mm_min_epu8(v, vmax);
    _mm_store_si128((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.79 +OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2);
1.80 +
/*
 * clamp_s16_sse: clamp n signed 16-bit values into [*src2_1, *src3_1].
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *   src3_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 8 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.
 */
SSE_FUNCTION static void
clamp_s16_sse (int16_t *dest, const int16_t *src1, int n,
    const int16_t *src2_1, const int16_t *src3_1)
{
  const int16_t min = *src2_1;
  const int16_t max = *src3_1;
  __m128i vmin, vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmin = _mm_set1_epi16(min);
  vmax = _mm_set1_epi16(max);
  for (; n >= 8; n -= 8) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_max_epi16(v, vmin);
    v = _mm_min_epi16(v, vmax);
    _mm_store_si128((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.118 +OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2);
1.119 +
/*
 * clamp_f32_sse: clamp n floats into the range [*src2_1, *src3_1].
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *   src3_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 4 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
    const float *src3_1)
{
  const float min = *src2_1;
  const float max = *src3_1;
  __m128 vmin, vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmin = _mm_set_ps1(min);
  vmax = _mm_set_ps1(max);
  for (; n >= 4; n -= 4) {
    __m128 v = _mm_loadu_ps(src1);
    /* MAXPS/MINPS yield their SECOND operand when either input is NaN or
     * both are zero.  Passing the bound first makes these compute
     * (min > x ? min : x) and (max < x ? max : x), i.e. exactly what the
     * scalar loops do; the reverse order would replace NaN by the bound. */
    v = _mm_max_ps(vmin, v);
    v = _mm_min_ps(vmax, v);
    _mm_store_ps(dest, v);
    dest += 4;
    src1 += 4;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    float x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.157 +OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE);
1.158 +
/*
 * clamp_f64_sse: clamp n doubles into the range [*src2_1, *src3_1].
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *   src3_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 2 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
    const double *src3_1)
{
  const double min = *src2_1;
  const double max = *src3_1;
  __m128d vmin, vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmin = _mm_set1_pd(min);
  vmax = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d v = _mm_loadu_pd(src1);
    /* MAXPD/MINPD yield their SECOND operand when either input is NaN or
     * both are zero.  Passing the bound first makes these compute
     * (min > x ? min : x) and (max < x ? max : x), i.e. exactly what the
     * scalar loops do; the reverse order would replace NaN by the bound. */
    v = _mm_max_pd(vmin, v);
    v = _mm_min_pd(vmax, v);
    _mm_store_pd(dest, v);
    dest += 2;
    src1 += 2;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.196 +OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64,
1.197 + OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
1.198 +
/*
 * clamplow_u8_sse: raise every one of n unsigned bytes to at least *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 16 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.
 */
SSE_FUNCTION static void
clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  const uint8_t min = *src2_1;
  __m128i vmin;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }

  vmin = _mm_set1_epi8((char)min);
  for (; n >= 16; n -= 16) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_max_epu8(v, vmin);
    _mm_store_si128((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }
}
1.229 +OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2);
1.230 +
/*
 * clamplow_s16_sse: raise every one of n signed 16-bit values to at least
 * *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 8 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.
 */
SSE_FUNCTION static void
clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
    const int16_t *src2_1)
{
  const int16_t min = *src2_1;
  __m128i vmin;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }

  vmin = _mm_set1_epi16(min);
  for (; n >= 8; n -= 8) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_max_epi16(v, vmin);
    _mm_store_si128((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }
}
1.261 +OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2);
1.262 +
/*
 * clamplow_f32_sse: raise every one of n floats to at least *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 4 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  const float min = *src2_1;
  __m128 vmin;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }

  vmin = _mm_set_ps1(min);
  for (; n >= 4; n -= 4) {
    __m128 v = _mm_loadu_ps(src1);
    /* MAXPS yields its SECOND operand when either input is NaN or both are
     * zero; with the bound first this is (min > x ? min : x), matching the
     * scalar loops.  The reverse order would replace NaN by min. */
    v = _mm_max_ps(vmin, v);
    _mm_store_ps(dest, v);
    dest += 4;
    src1 += 4;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    float x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }
}
1.292 +OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE);
1.293 +
/*
 * clamplow_f64_sse: raise every one of n doubles to at least *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the lower bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 2 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  const double min = *src2_1;
  __m128d vmin;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }

  vmin = _mm_set1_pd(min);
  for (; n >= 2; n -= 2) {
    __m128d v = _mm_loadu_pd(src1);
    /* MAXPD yields its SECOND operand when either input is NaN or both are
     * zero; with the bound first this is (min > x ? min : x), matching the
     * scalar loops.  The reverse order would replace NaN by min. */
    v = _mm_max_pd(vmin, v);
    _mm_store_pd(dest, v);
    dest += 2;
    src1 += 2;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
  }
}
1.323 +OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64,
1.324 + OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
1.325 +
/*
 * clamphigh_u8_sse: lower every one of n unsigned bytes to at most *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 16 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.
 */
SSE_FUNCTION static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  const uint8_t max = *src2_1;
  __m128i vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmax = _mm_set1_epi8((char)max);
  for (; n >= 16; n -= 16) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_min_epu8(v, vmax);
    _mm_store_si128((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.356 +OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2);
1.357 +
/*
 * clamphigh_s16_sse: lower every one of n signed 16-bit values to at most
 * *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 8 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.
 */
SSE_FUNCTION static void
clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
    const int16_t *src2_1)
{
  const int16_t max = *src2_1;
  __m128i vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmax = _mm_set1_epi16(max);
  for (; n >= 8; n -= 8) {
    /* src1 may still be misaligned, hence loadu; dest is aligned now. */
    __m128i v = _mm_loadu_si128((const __m128i *)src1);
    v = _mm_min_epi16(v, vmax);
    _mm_store_si128((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.388 +OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2);
1.389 +
/*
 * clamphigh_f32_sse: lower every one of n floats to at most *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 4 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  const float max = *src2_1;
  __m128 vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmax = _mm_set_ps1(max);
  for (; n >= 4; n -= 4) {
    __m128 v = _mm_loadu_ps(src1);
    /* MINPS yields its SECOND operand when either input is NaN or both are
     * zero; with the bound first this is (max < x ? max : x), matching the
     * scalar loops.  The reverse order would replace NaN by max. */
    v = _mm_min_ps(vmax, v);
    _mm_store_ps(dest, v);
    dest += 4;
    src1 += 4;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.419 +OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE);
1.420 +
/*
 * clamphigh_f64_sse: lower every one of n doubles to at most *src2_1.
 *
 *   dest    output array, n elements
 *   src1    input array, n elements
 *   n       element count; nothing is read or written when n <= 0
 *   src2_1  pointer to the upper bound (read once, up front)
 *
 * A scalar loop runs until dest is 16-byte aligned, then 2 elements are
 * handled per iteration (unaligned load, aligned store), and a scalar loop
 * finishes the remainder.  All three phases produce the same value for a
 * given input, including NaN (passed through) and signed zeros.
 */
SSE_FUNCTION static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  const double max = *src2_1;
  __m128d vmax;

  /* Scalar head.  uintptr_t rather than long: long is narrower than a
   * pointer on LLP64 targets. */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  vmax = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d v = _mm_loadu_pd(src1);
    /* MINPD yields its SECOND operand when either input is NaN or both are
     * zero; with the bound first this is (max < x ? max : x), matching the
     * scalar loops.  The reverse order would replace NaN by max. */
    v = _mm_min_pd(vmax, v);
    _mm_store_pd(dest, v);
    dest += 2;
    src1 += 2;
  }

  /* Scalar tail. */
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
1.450 +OIL_DEFINE_IMPL_FULL (clamphigh_f64_sse, clamphigh_f64,
1.451 + OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
1.452 +
1.453 +
1.454 +#ifdef __SYMBIAN32__
1.455 +
1.456 +OilFunctionImpl* __oil_function_impl_clamp_u8_sse, clamp_u8() {
1.457 + return &_oil_function_impl_clamp_u8_sse, clamp_u8;
1.458 +}
1.459 +#endif
1.460 +
1.461 +#ifdef __SYMBIAN32__
1.462 +
1.463 +OilFunctionImpl* __oil_function_impl_clamp_s16_sse, clamp_s16() {
1.464 + return &_oil_function_impl_clamp_s16_sse, clamp_s16;
1.465 +}
1.466 +#endif
1.467 +
1.468 +#ifdef __SYMBIAN32__
1.469 +
1.470 +OilFunctionImpl* __oil_function_impl_clamp_f32_sse, clamp_f32() {
1.471 + return &_oil_function_impl_clamp_f32_sse, clamp_f32;
1.472 +}
1.473 +#endif
1.474 +
1.475 +#ifdef __SYMBIAN32__
1.476 +
1.477 +OilFunctionImpl* __oil_function_impl_clamp_f64_sse, clamp_f64() {
1.478 + return &_oil_function_impl_clamp_f64_sse, clamp_f64;
1.479 +}
1.480 +#endif
1.481 +
1.482 +#ifdef __SYMBIAN32__
1.483 +
1.484 +OilFunctionImpl* __oil_function_impl_clamplow_u8_sse, clamplow_u8() {
1.485 + return &_oil_function_impl_clamplow_u8_sse, clamplow_u8;
1.486 +}
1.487 +#endif
1.488 +
1.489 +#ifdef __SYMBIAN32__
1.490 +
1.491 +OilFunctionImpl* __oil_function_impl_clamplow_s16_sse, clamplow_s16() {
1.492 + return &_oil_function_impl_clamplow_s16_sse, clamplow_s16;
1.493 +}
1.494 +#endif
1.495 +
1.496 +#ifdef __SYMBIAN32__
1.497 +
1.498 +OilFunctionImpl* __oil_function_impl_clamplow_f32_sse, clamplow_f32() {
1.499 + return &_oil_function_impl_clamplow_f32_sse, clamplow_f32;
1.500 +}
1.501 +#endif
1.502 +
1.503 +#ifdef __SYMBIAN32__
1.504 +
1.505 +OilFunctionImpl* __oil_function_impl_clamplow_f64_sse, clamplow_f64() {
1.506 + return &_oil_function_impl_clamplow_f64_sse, clamplow_f64;
1.507 +}
1.508 +#endif
1.509 +
1.510 +#ifdef __SYMBIAN32__
1.511 +
1.512 +OilFunctionImpl* __oil_function_impl_clamphigh_u8_sse, clamphigh_u8() {
1.513 + return &_oil_function_impl_clamphigh_u8_sse, clamphigh_u8;
1.514 +}
1.515 +#endif
1.516 +
1.517 +#ifdef __SYMBIAN32__
1.518 +
1.519 +OilFunctionImpl* __oil_function_impl_clamphigh_s16_sse, clamphigh_s16() {
1.520 + return &_oil_function_impl_clamphigh_s16_sse, clamphigh_s16;
1.521 +}
1.522 +#endif
1.523 +
1.524 +#ifdef __SYMBIAN32__
1.525 +
1.526 +OilFunctionImpl* __oil_function_impl_clamphigh_f32_sse, clamphigh_f32() {
1.527 + return &_oil_function_impl_clamphigh_f32_sse, clamphigh_f32;
1.528 +}
1.529 +#endif
1.530 +
1.531 +#ifdef __SYMBIAN32__
1.532 +
1.533 +OilFunctionImpl* __oil_function_impl_clamphigh_f64_sse, clamphigh_f64() {
1.534 + return &_oil_function_impl_clamphigh_f64_sse, clamphigh_f64;
1.535 +}
1.536 +#endif
1.537 +