1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/i386/rowcolsad8x8_i386.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,310 @@
1.4 +/*
1.5 + * LIBOIL - Library of Optimized Inner Loops
1.6 + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
1.7 + * All rights reserved.
1.8 + *
1.9 + * Redistribution and use in source and binary forms, with or without
1.10 + * modification, are permitted provided that the following conditions
1.11 + * are met:
1.12 + * 1. Redistributions of source code must retain the above copyright
1.13 + * notice, this list of conditions and the following disclaimer.
1.14 + * 2. Redistributions in binary form must reproduce the above copyright
1.15 + * notice, this list of conditions and the following disclaimer in the
1.16 + * documentation and/or other materials provided with the distribution.
1.17 + *
1.18 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
1.19 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1.20 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.21 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
1.22 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1.23 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1.24 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.25 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
1.26 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
1.27 + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.28 + * POSSIBILITY OF SUCH DAMAGE.
1.29 + */
1.30 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.31 +
1.32 +#ifdef HAVE_CONFIG_H
1.33 +#include "config.h"
1.34 +#endif
1.35 +
1.36 +#include <liboil/liboilfunction.h>
1.37 +
1.38 +OIL_DECLARE_CLASS (rowsad8x8_u8);
1.39 +OIL_DECLARE_CLASS (colsad8x8_u8);
1.40 +
/*
 * rowsad8x8_u8_mmx:
 * @dest: receives the result
 * @src1: first row of 8 bytes
 * @src2: second row of 8 bytes
 *
 * Computes the sum of absolute differences between src1 and src2
 * separately for bytes 0-3 and for bytes 4-7, and stores the larger of
 * the two sums in *dest.  Plain MMX only (no psadbw).
 *
 * NOTE(review): when __WINSCW__ or __WINS__ is defined the body is
 * compiled out and *dest is left unwritten.
 */
static void
rowsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t MaxSad;

  __asm__ __volatile__ (
    "  pxor        %%mm6, %%mm6       \n\t"  /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7       \n\t"  /* zero out mm7 for unpack */
    "  movq        (%[src1]), %%mm0   \n\t"  /* take 8 bytes */
    "  movq        (%[src2]), %%mm1   \n\t"

    "  movq        %%mm0, %%mm2       \n\t"
    "  psubusb     %%mm1, %%mm0       \n\t"  /* A - B, saturating at 0 */
    "  psubusb     %%mm2, %%mm1       \n\t"  /* B - A, saturating at 0 */
    "  por         %%mm1, %%mm0       \n\t"  /* one side is 0, so OR gives |A - B| */

    "  movq        %%mm0, %%mm1       \n\t"

    "  punpcklbw   %%mm6, %%mm0       \n\t"  /* low four bytes -> four words */
    "  punpckhbw   %%mm7, %%mm1       \n\t"  /* high four bytes -> four words */

    /* Horizontal add: after the two folds, word 0 of each register
     * holds the sum of that register's four words. */
    "  movq        %%mm0, %%mm2       \n\t"
    "  movq        %%mm1, %%mm3       \n\t"
    "  psrlq       $32, %%mm2         \n\t"  /* fold and add */
    "  psrlq       $32, %%mm3         \n\t"
    "  paddw       %%mm2, %%mm0       \n\t"
    "  paddw       %%mm3, %%mm1       \n\t"
    "  movq        %%mm0, %%mm2       \n\t"
    "  movq        %%mm1, %%mm3       \n\t"
    "  psrlq       $16, %%mm2         \n\t"
    "  psrlq       $16, %%mm3         \n\t"
    "  paddw       %%mm2, %%mm0       \n\t"
    "  paddw       %%mm3, %%mm1       \n\t"

    /* Unsigned max without pmax*: sat(b - a) + a == max(a, b). */
    "  psubusw     %%mm0, %%mm1       \n\t"
    "  paddw       %%mm0, %%mm1       \n\t"  /* mm1 = max(mm1, mm0) */
    "  movd        %%mm1, %[max]      \n\t"
    "  andl        $0xffff, %[max]    \n\t"  /* only word 0 carries the result */
    "  emms                           \n\t"

    : [max] "=m" (MaxSad)
    : [src1] "r" (src1),
      [src2] "r" (src2)
    : "memory"
  );
  *dest = MaxSad;
#endif
}
1.90 +OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, rowsad8x8_u8, OIL_IMPL_FLAG_MMX);
1.91 +
/*
 * rowsad8x8_u8_mmxext:
 * @dest: receives the result
 * @src1: first row of 8 bytes
 * @src2: second row of 8 bytes
 *
 * Same computation as the plain-MMX variant, using psadbw/pmaxsw:
 * the SAD of bytes 0-3 and the SAD of bytes 4-7 are computed
 * independently and the larger one is written to *dest.
 */
static void
rowsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t larger_sad;

  __asm__ __volatile__ (
    /* movd clears the upper half of the register, so each psadbw
     * below only sums four byte differences. */
    "  movd        (%[a]), %%mm0      \n\t"  /* a[0..3] */
    "  movd        4(%[a]), %%mm2     \n\t"  /* a[4..7] */
    "  movd        (%[b]), %%mm1      \n\t"  /* b[0..3] */
    "  movd        4(%[b]), %%mm3     \n\t"  /* b[4..7] */

    "  psadbw      %%mm0, %%mm1       \n\t"  /* mm1 = SAD of low halves */
    "  psadbw      %%mm2, %%mm3       \n\t"  /* mm3 = SAD of high halves */

    "  pmaxsw      %%mm1, %%mm3       \n\t"  /* mm3 = max(mm1, mm3) */
    "  movd        %%mm3, %[out]      \n\t"
    "  andl        $0xffff, %[out]    \n\t"  /* result lives in word 0 */
    "  emms                           \n\t"

    : [out] "=m" (larger_sad),
      [a] "+r" (src1),
      [b] "+r" (src2)
    :
    : "memory"
  );

  *dest = larger_sad;
#endif
}
1.120 +OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, rowsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.121 +
/*
 * colsad8x8_u8_mmx:
 * @dest: receives the result
 * @src1: first 8x8 byte block
 * @ss1: distance in bytes between consecutive rows of src1
 * @src2: second 8x8 byte block
 * @ss2: distance in bytes between consecutive rows of src2
 *
 * For each of the 8 columns, accumulates the absolute differences
 * between src1 and src2 over rows 0-3 and, separately, over rows 4-7.
 * The largest of those 16 column sums is stored in *dest.
 *
 * src2 is advanced by ss2; earlier code advanced both pointers by ss1
 * and ignored ss2.
 *
 * NOTE(review): when __WINSCW__ or __WINS__ is defined the body is
 * compiled out and *dest is left unwritten.
 */
static void
colsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t MaxSad;
  /* Keep the strides in 'long' so the register used by "add" has the
   * same width as the pointer register it is added to (assumes long is
   * pointer-sized, as on ILP32 and LP64 x86 targets). */
  long stride1 = ss1;
  long stride2 = ss2;

  __asm__ __volatile__ (
    "  pxor        %%mm3, %%mm3           \n\t"  /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4           \n\t"  /* mm4: rows 0-3, columns 0-3 */
    "  pxor        %%mm5, %%mm5           \n\t"  /* mm5: rows 0-3, columns 4-7 */
    "  pxor        %%mm6, %%mm6           \n\t"  /* mm6: rows 4-7, columns 0-3 */
    "  pxor        %%mm7, %%mm7           \n\t"  /* mm7: rows 4-7, columns 4-7 */
    "  mov         $4, %%edi              \n\t"  /* 4 rows */
    "1:                                   \n\t"
    "  movq        (%[src1]), %%mm0       \n\t"  /* take 8 bytes */
    "  movq        (%[src2]), %%mm1       \n\t"  /* take 8 bytes */

    "  movq        %%mm0, %%mm2           \n\t"
    "  psubusb     %%mm1, %%mm0           \n\t"  /* A - B, saturating at 0 */
    "  psubusb     %%mm2, %%mm1           \n\t"  /* B - A, saturating at 0 */
    "  por         %%mm1, %%mm0           \n\t"  /* one side is 0, so OR gives |A - B| */
    "  movq        %%mm0, %%mm1           \n\t"

    "  punpcklbw   %%mm3, %%mm0           \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm4           \n\t"  /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1           \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm5           \n\t"  /* accumulate columns 4-7 */
    "  add         %[stride1], %[src1]    \n\t"  /* next row of src1 */
    "  add         %[stride2], %[src2]    \n\t"  /* next row of src2 */

    "  dec         %%edi                  \n\t"
    "  jnz 1b                             \n\t"

    "  mov         $4, %%edi              \n\t"  /* 4 rows */
    "2:                                   \n\t"
    "  movq        (%[src1]), %%mm0       \n\t"  /* take 8 bytes */
    "  movq        (%[src2]), %%mm1       \n\t"  /* take 8 bytes */

    "  movq        %%mm0, %%mm2           \n\t"
    "  psubusb     %%mm1, %%mm0           \n\t"  /* A - B, saturating at 0 */
    "  psubusb     %%mm2, %%mm1           \n\t"  /* B - A, saturating at 0 */
    "  por         %%mm1, %%mm0           \n\t"  /* one side is 0, so OR gives |A - B| */
    "  movq        %%mm0, %%mm1           \n\t"

    "  punpcklbw   %%mm3, %%mm0           \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm6           \n\t"  /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1           \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm7           \n\t"  /* accumulate columns 4-7 */
    "  add         %[stride1], %[src1]    \n\t"  /* next row of src1 */
    "  add         %[stride2], %[src2]    \n\t"  /* next row of src2 */

    "  dec         %%edi                  \n\t"
    "  jnz 2b                             \n\t"

    /* Unsigned max without pmax*: sat(b - a) + a == max(a, b). */
    "  psubusw     %%mm6, %%mm7           \n\t"
    "  paddw       %%mm6, %%mm7           \n\t"  /* mm7 = max(mm7, mm6) */
    "  psubusw     %%mm4, %%mm5           \n\t"
    "  paddw       %%mm4, %%mm5           \n\t"  /* mm5 = max(mm5, mm4) */
    "  psubusw     %%mm5, %%mm7           \n\t"
    "  paddw       %%mm5, %%mm7           \n\t"  /* mm7 = max(mm5, mm7) */
    /* Horizontal max of the four words of mm7 into word 0. */
    "  movq        %%mm7, %%mm6           \n\t"
    "  psrlq       $32, %%mm6             \n\t"
    "  psubusw     %%mm6, %%mm7           \n\t"
    "  paddw       %%mm6, %%mm7           \n\t"  /* words 0,1 = max with words 2,3 */
    "  movq        %%mm7, %%mm6           \n\t"
    "  psrlq       $16, %%mm6             \n\t"
    "  psubusw     %%mm6, %%mm7           \n\t"
    "  paddw       %%mm6, %%mm7           \n\t"  /* word 0 = overall max */
    "  movd        %%mm7, %[max]          \n\t"
    "  andl        $0xffff, %[max]        \n\t"  /* only word 0 carries the result */
    "  emms                               \n\t"

    : [max] "=m" (MaxSad),
      [src1] "+r" (src1),
      [src2] "+r" (src2)
    : [stride1] "r" (stride1),
      [stride2] "r" (stride2)
    : "memory", "edi"
  );
  *dest = MaxSad;
#endif
}
1.203 +OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, colsad8x8_u8, OIL_IMPL_FLAG_MMX);
1.204 +
/*
 * colsad8x8_u8_mmxext:
 * @dest: receives the result
 * @src1: first 8x8 byte block
 * @ss1: distance in bytes between consecutive rows of src1
 * @src2: second 8x8 byte block
 * @ss2: distance in bytes between consecutive rows of src2
 *
 * For each of the 8 columns, accumulates the absolute differences
 * between src1 and src2 over rows 0-3 and, separately, over rows 4-7.
 * The largest of those 16 column sums is stored in *dest.  Identical
 * to the plain-MMX variant except that the final reduction uses pmaxsw.
 *
 * src2 is advanced by ss2; earlier code advanced both pointers by ss1
 * and ignored ss2.
 *
 * NOTE(review): when __WINSCW__ or __WINS__ is defined the body is
 * compiled out and *dest is left unwritten.
 */
static void
colsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t MaxSad;
  /* Keep the strides in 'long' so the register used by "add" has the
   * same width as the pointer register it is added to (assumes long is
   * pointer-sized, as on ILP32 and LP64 x86 targets). */
  long stride1 = ss1;
  long stride2 = ss2;

  __asm__ __volatile__ (
    "  pxor        %%mm3, %%mm3           \n\t"  /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4           \n\t"  /* mm4: rows 0-3, columns 0-3 */
    "  pxor        %%mm5, %%mm5           \n\t"  /* mm5: rows 0-3, columns 4-7 */
    "  pxor        %%mm6, %%mm6           \n\t"  /* mm6: rows 4-7, columns 0-3 */
    "  pxor        %%mm7, %%mm7           \n\t"  /* mm7: rows 4-7, columns 4-7 */
    "  mov         $4, %%edi              \n\t"  /* 4 rows */
    "1:                                   \n\t"
    "  movq        (%[src1]), %%mm0       \n\t"  /* take 8 bytes */
    "  movq        (%[src2]), %%mm1       \n\t"  /* take 8 bytes */

    "  movq        %%mm0, %%mm2           \n\t"
    "  psubusb     %%mm1, %%mm0           \n\t"  /* A - B, saturating at 0 */
    "  psubusb     %%mm2, %%mm1           \n\t"  /* B - A, saturating at 0 */
    "  por         %%mm1, %%mm0           \n\t"  /* one side is 0, so OR gives |A - B| */
    "  movq        %%mm0, %%mm1           \n\t"

    "  punpcklbw   %%mm3, %%mm0           \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm4           \n\t"  /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1           \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm5           \n\t"  /* accumulate columns 4-7 */
    "  add         %[stride1], %[src1]    \n\t"  /* next row of src1 */
    "  add         %[stride2], %[src2]    \n\t"  /* next row of src2 */

    "  dec         %%edi                  \n\t"
    "  jnz 1b                             \n\t"

    "  mov         $4, %%edi              \n\t"  /* 4 rows */
    "2:                                   \n\t"
    "  movq        (%[src1]), %%mm0       \n\t"  /* take 8 bytes */
    "  movq        (%[src2]), %%mm1       \n\t"  /* take 8 bytes */

    "  movq        %%mm0, %%mm2           \n\t"
    "  psubusb     %%mm1, %%mm0           \n\t"  /* A - B, saturating at 0 */
    "  psubusb     %%mm2, %%mm1           \n\t"  /* B - A, saturating at 0 */
    "  por         %%mm1, %%mm0           \n\t"  /* one side is 0, so OR gives |A - B| */
    "  movq        %%mm0, %%mm1           \n\t"

    "  punpcklbw   %%mm3, %%mm0           \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm6           \n\t"  /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1           \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm7           \n\t"  /* accumulate columns 4-7 */
    "  add         %[stride1], %[src1]    \n\t"  /* next row of src1 */
    "  add         %[stride2], %[src2]    \n\t"  /* next row of src2 */

    "  dec         %%edi                  \n\t"
    "  jnz 2b                             \n\t"

    "  pmaxsw      %%mm6, %%mm7           \n\t"  /* mm7 = max(mm7, mm6) */
    "  pmaxsw      %%mm4, %%mm5           \n\t"  /* mm5 = max(mm5, mm4) */
    "  pmaxsw      %%mm5, %%mm7           \n\t"  /* mm7 = max(mm5, mm7) */
    /* Horizontal max of the four words of mm7 into word 0. */
    "  movq        %%mm7, %%mm6           \n\t"
    "  psrlq       $32, %%mm6             \n\t"
    "  pmaxsw      %%mm6, %%mm7           \n\t"  /* words 0,1 = max with words 2,3 */
    "  movq        %%mm7, %%mm6           \n\t"
    "  psrlq       $16, %%mm6             \n\t"
    "  pmaxsw      %%mm6, %%mm7           \n\t"  /* word 0 = overall max */
    "  movd        %%mm7, %[max]          \n\t"
    "  andl        $0xffff, %[max]        \n\t"  /* only word 0 carries the result */
    "  emms                               \n\t"

    : [max] "=m" (MaxSad),
      [src1] "+r" (src1),
      [src2] "+r" (src2)
    : [stride1] "r" (stride1),
      [stride2] "r" (stride2)
    : "memory", "edi"
  );

  *dest = MaxSad;
#endif
}
1.282 +OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, colsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.283 +
1.284 +
1.285 +
#ifdef	__SYMBIAN32__

/*
 * Symbian accessor: returns the address of the rowsad8x8_u8_mmx
 * implementation descriptor.  The previous text carried a stray
 * ", rowsad8x8_u8" in both the function name and the return expression,
 * which is not a valid function definition.
 * NOTE(review): _oil_function_impl_rowsad8x8_u8_mmx is presumably the
 * object emitted by OIL_DEFINE_IMPL_FULL for this implementation --
 * confirm against liboilfunction.h.
 */
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmx(void) {
		return &_oil_function_impl_rowsad8x8_u8_mmx;
}
#endif
1.292 +
#ifdef	__SYMBIAN32__

/*
 * Symbian accessor: returns the address of the rowsad8x8_u8_mmxext
 * implementation descriptor.  The previous text carried a stray
 * ", rowsad8x8_u8" in both the function name and the return expression,
 * which is not a valid function definition.
 * NOTE(review): _oil_function_impl_rowsad8x8_u8_mmxext is presumably the
 * object emitted by OIL_DEFINE_IMPL_FULL for this implementation --
 * confirm against liboilfunction.h.
 */
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmxext(void) {
		return &_oil_function_impl_rowsad8x8_u8_mmxext;
}
#endif
1.299 +
#ifdef	__SYMBIAN32__

/*
 * Symbian accessor: returns the address of the colsad8x8_u8_mmx
 * implementation descriptor.  The previous text carried a stray
 * ", colsad8x8_u8" in both the function name and the return expression,
 * which is not a valid function definition.
 * NOTE(review): _oil_function_impl_colsad8x8_u8_mmx is presumably the
 * object emitted by OIL_DEFINE_IMPL_FULL for this implementation --
 * confirm against liboilfunction.h.
 */
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmx(void) {
		return &_oil_function_impl_colsad8x8_u8_mmx;
}
#endif
1.306 +
#ifdef	__SYMBIAN32__

/*
 * Symbian accessor: returns the address of the colsad8x8_u8_mmxext
 * implementation descriptor.  The previous text carried a stray
 * ", colsad8x8_u8" in both the function name and the return expression,
 * which is not a valid function definition.
 * NOTE(review): _oil_function_impl_colsad8x8_u8_mmxext is presumably the
 * object emitted by OIL_DEFINE_IMPL_FULL for this implementation --
 * confirm against liboilfunction.h.
 */
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmxext(void) {
		return &_oil_function_impl_colsad8x8_u8_mmxext;
}
#endif
1.313 +