1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/i386/error8x8_i386.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,373 @@
1.4 +/*
1.5 + * LIBOIL - Library of Optimized Inner Loops
1.6 + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
1.7 + * All rights reserved.
1.8 + *
1.9 + * Redistribution and use in source and binary forms, with or without
1.10 + * modification, are permitted provided that the following conditions
1.11 + * are met:
1.12 + * 1. Redistributions of source code must retain the above copyright
1.13 + * notice, this list of conditions and the following disclaimer.
1.14 + * 2. Redistributions in binary form must reproduce the above copyright
1.15 + * notice, this list of conditions and the following disclaimer in the
1.16 + * documentation and/or other materials provided with the distribution.
1.17 + *
1.18 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
1.19 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1.20 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.21 + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
1.22 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1.23 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1.24 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.25 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
1.26 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
1.27 + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.28 + * POSSIBILITY OF SUCH DAMAGE.
1.29 + */
1.30 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.31 +
1.32 +#ifdef HAVE_CONFIG_H
1.33 +#include "config.h"
1.34 +#endif
1.35 +
1.36 +#include <liboil/liboilfunction.h>
1.37 +
1.38 +OIL_DECLARE_CLASS (err_intra8x8_u8);
1.39 +OIL_DECLARE_CLASS (err_inter8x8_u8);
1.40 +OIL_DECLARE_CLASS (err_inter8x8_u8_avg);
1.41 +
/*
 * err_intra8x8_u8_mmx:
 * Intra-block mis-match metric for one 8x8 block of unsigned 8-bit samples.
 *
 * dest: receives 64*SUM(x^2) - (SUM(x))^2, i.e. the population variance of
 *       the 64 samples scaled by 64.
 * src1: top-left sample of the 8x8 block.
 * ss1:  byte stride between successive rows of src1.
 *
 * The loop accumulates the word-wise sample sum in mm5 and the sum of
 * squares (via pmaddwd) in mm7, then folds each accumulator horizontally.
 * The total of 64 samples is at most 64*255 = 16320, so it fits in a
 * signed 16-bit word and the movswl fold is safe.
 *
 * On Symbian emulator builds (__WINSCW__/__WINS__) the x86 assembly is
 * compiled out and *dest is left unmodified.
 */
static void
err_intra8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t xsum;
  uint32_t xxsum;

  __asm__ __volatile__ (
    " pxor %%mm5, %%mm5 \n\t"      /* mm5: sum accumulator (4 words) */
    " pxor %%mm6, %%mm6 \n\t"      /* mm6: constant zero for unpacking */
    " pxor %%mm7, %%mm7 \n\t"      /* mm7: sum-of-squares accumulator */
    " mov $8, %%edi \n\t"          /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq %%mm0, %%mm2 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t"
    " punpckhbw %%mm6, %%mm2 \n\t"

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t"
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %3, %2 \n\t" /* Inc pointer into src data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* Fold the four word lanes of mm5 into one 16-bit sum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movswl %%di, %%edi \n\t"
    " movl %%edi, %0 \n\t"

    /* Fold the two dword lanes of mm7 into the sum of squares. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
    " emms \n\t"

    : "=r" (xsum),
      "=r" (xxsum),
      "+r" (src1)
    : "r" (ss1)
    : "edi", "memory"
  );

  /* Compute population variance as mis-match metric. */
  *dest = (((xxsum<<6) - xsum*xsum));
#endif
}
OIL_DEFINE_IMPL_FULL (err_intra8x8_u8_mmx, err_intra8x8_u8, OIL_IMPL_FLAG_MMX);
1.103 +
/*
 * err_inter8x8_u8_mmx:
 * Inter-block mis-match metric between two 8x8 blocks of 8-bit samples.
 *
 * dest: receives 64*SUM(d^2) - (SUM(d))^2 where d = src1 - src2 per
 *       sample (population variance of the difference, scaled by 64).
 * src1: top-left sample of the source 8x8 block; ss1 is its row stride.
 * src2: top-left sample of the reference 8x8 block; ss2 is its row stride.
 *
 * Fix: xsum/xxsum were declared outside the !__WINSCW__/!__WINS__ guard,
 * unlike every sibling implementation in this file, which left them as
 * unused variables on Symbian emulator builds.  They are now declared
 * inside the guard; on emulator builds the assembly is compiled out and
 * *dest is left unmodified, matching err_intra8x8_u8_mmx above.
 */
static void
err_inter8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t xsum;
  uint32_t xxsum;

  __asm__ __volatile__ (
    " pxor %%mm5, %%mm5 \n\t"      /* mm5: difference-sum accumulator */
    " pxor %%mm6, %%mm6 \n\t"      /* mm6: constant zero for unpacking */
    " pxor %%mm7, %%mm7 \n\t"      /* mm7: squared-difference accumulator */
    " mov $8, %%edi \n\t"          /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq (%3), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t"
    " punpcklbw %%mm6, %%mm1 \n\t"
    " punpckhbw %%mm6, %%mm2 \n\t"
    " punpckhbw %%mm6, %%mm3 \n\t"

    " psubsw %%mm1, %%mm0 \n\t"
    " psubsw %%mm3, %%mm2 \n\t"

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t"
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %4, %2 \n\t" /* Inc pointer into src data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* Fold the four word lanes of mm5 into one signed 16-bit sum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movswl %%di, %%edi \n\t"
    " movl %%edi, %0 \n\t"

    /* Fold the two dword lanes of mm7 into the sum of squares. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
    " emms \n\t"

    : "=m" (xsum),
      "=m" (xxsum),
      "+r" (src1),
      "+r" (src2)
    : "m" (ss1),
      "m" (ss2)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  *dest = (((xxsum<<6) - xsum*xsum));
#endif
}
OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_mmx, err_inter8x8_u8, OIL_IMPL_FLAG_MMX);
1.174 +
/*
 * err_inter8x8_u8_avg_mmx:
 * Inter-block mis-match metric of src1 against the byte-wise average of
 * two reference blocks.
 *
 * dest: receives 64*SUM(d^2) - (SUM(d))^2 where
 *       d = src1 - avg(src2, src3) per sample.
 * src1: source 8x8 block; ss1 is its row stride in bytes.
 * src2, src3: reference 8x8 blocks, both advanced by ss2 per row.
 *
 * The byte average is computed without widening as
 *   (a & b) + (((a ^ b) & 0xfe) >> 1)
 * i.e. the truncating (round-down) average of each byte pair.
 *
 * On Symbian emulator builds (__WINSCW__/__WINS__) the x86 assembly is
 * compiled out and *dest is left unmodified.
 */
static void
err_inter8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t xsum;
  uint32_t xxsum;

  __asm__ __volatile__ (
    " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
    " paddb %%mm4, %%mm4 \n\t"
    " pxor %%mm5, %%mm5 \n\t"      /* mm5: difference-sum accumulator */
    " pxor %%mm6, %%mm6 \n\t"      /* mm6: constant zero for unpacking */
    " pxor %%mm7, %%mm7 \n\t"      /* mm7: squared-difference accumulator */
    " mov $8, %%edi \n\t"          /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */

    " movq (%3), %%mm2 \n\t"
    " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
    " movq %%mm2, %%mm1 \n\t"
    " pand %%mm3, %%mm1 \n\t"      /* mm1 = src2 & src3 */
    " pxor %%mm2, %%mm3 \n\t"      /* mm3 = src2 ^ src3 */
    " pand %%mm4, %%mm3 \n\t"      /* clear low bit so the shift stays per-byte */
    " psrlq $1, %%mm3 \n\t"
    " paddb %%mm3, %%mm1 \n\t"     /* mm1 = truncating byte average */

    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t"
    " punpcklbw %%mm6, %%mm1 \n\t"
    " punpckhbw %%mm6, %%mm2 \n\t"
    " punpckhbw %%mm6, %%mm3 \n\t"

    " psubsw %%mm1, %%mm0 \n\t"
    " psubsw %%mm3, %%mm2 \n\t"

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t"
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %5, %2 \n\t" /* Inc pointer into src data */
    " add %6, %3 \n\t" /* Inc pointer into ref data */
    " add %6, %4 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* Fold the four word lanes of mm5 into one signed 16-bit sum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movswl %%di, %%edi \n\t"
    " movl %%edi, %0 \n\t"

    /* Fold the two dword lanes of mm7 into the sum of squares. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
    " emms \n\t"

    : "=m" (xsum),
      "=m" (xxsum),
      "+r" (src1),
      "+r" (src2),
      "+r" (src3)
    : "m" (ss1),
      "m" (ss2)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  *dest = (((xxsum<<6) - xsum*xsum));
#endif
}

OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmx, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX);
1.260 +
#ifdef ENABLE_BROKEN_IMPLS
/*
 * err_inter8x8_u8_avg_mmxext:
 * MMXEXT variant of err_inter8x8_u8_avg_mmx that forms the reference
 * average with pavgb.  pavgb rounds up, so ((src2 ^ src3) & 0x01) is
 * subtracted from the result to convert it to the round-down average the
 * plain MMX version computes.
 *
 * NOTE(review): compiled only under ENABLE_BROKEN_IMPLS — this
 * implementation is flagged as broken upstream.  The exact defect is not
 * evident from this file alone, so the code is documented but left
 * untouched; confirm against liboil upstream before enabling.
 */
static void
err_inter8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
{
  uint32_t xsum;
  uint32_t xxsum;

  __asm__ __volatile__ (
    " pxor %%mm4, %%mm4 \n\t"      /* mm4: constant zero for unpacking */
    " pxor %%mm5, %%mm5 \n\t"      /* mm5: difference-sum accumulator */
    " mov $0x01010101, %%edi \n\t"
    " movd %%edi, %%mm6 \n\t"
    " punpcklbw %%mm6, %%mm6 \n\t" /* mm6 = 0x0101010101010101 */
    " pxor %%mm7, %%mm7 \n\t"      /* mm7: squared-difference accumulator */
    " mov $8, %%edi \n\t"          /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */

    " movq (%3), %%mm2 \n\t"
    " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
    " movq %%mm1, %%mm3 \n\t"
    " pavgb %%mm2, %%mm1 \n\t"     /* rounding-up byte average */
    " pxor %%mm2, %%mm3 \n\t"
    " pand %%mm6, %%mm3 \n\t"      /* low bit of (src2 ^ src3) per byte */
    " psubb %%mm3, %%mm1 \n\t"     /* convert to round-down average */

    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm4, %%mm0 \n\t"
    " punpcklbw %%mm4, %%mm1 \n\t"
    " punpckhbw %%mm4, %%mm2 \n\t"
    " punpckhbw %%mm4, %%mm3 \n\t"

    " psubsw %%mm1, %%mm0 \n\t"
    " psubsw %%mm3, %%mm2 \n\t"

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t"
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %5, %2 \n\t" /* Inc pointer into src data */
    " add %6, %3 \n\t" /* Inc pointer into ref data */
    " add %6, %4 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* Fold the four word lanes of mm5 into one signed 16-bit sum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movswl %%di, %%edi \n\t"
    " movl %%edi, %0 \n\t"

    /* Fold the two dword lanes of mm7 into the sum of squares. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
    " emms \n\t"

    : "=m" (xsum),
      "=m" (xxsum),
      "+r" (src1),
      "+r" (src2),
      "+r" (src3)
    : "m" (ss1),
      "m" (ss2)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  *dest = (((xxsum<<6) - xsum*xsum));
}

OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmxext, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif
1.346 +
1.347 +
1.348 +
#ifdef __SYMBIAN32__
/*
 * Symbian export shim: returns the OilFunctionImpl record that
 * OIL_DEFINE_IMPL_FULL (err_intra8x8_u8_mmx, ...) registers above.
 * Fix: the generated code declared a comma pair
 * ("...__mmx, err_intra8x8_u8()") and returned a comma expression
 * referencing the undeclared identifier err_intra8x8_u8, which does not
 * compile; restored to the conventional one-getter-per-impl form.
 */
OilFunctionImpl* __oil_function_impl_err_intra8x8_u8_mmx(void) {
  return &_oil_function_impl_err_intra8x8_u8_mmx;
}
#endif
1.355 +
#ifdef __SYMBIAN32__
/*
 * Symbian export shim: returns the OilFunctionImpl record that
 * OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_mmx, ...) registers above.
 * Fix: the generated code declared a comma pair and returned a comma
 * expression referencing the undeclared identifier err_inter8x8_u8,
 * which does not compile; restored to the conventional getter form.
 */
OilFunctionImpl* __oil_function_impl_err_inter8x8_u8_mmx(void) {
  return &_oil_function_impl_err_inter8x8_u8_mmx;
}
#endif
1.362 +
#ifdef __SYMBIAN32__
/*
 * Symbian export shim: returns the OilFunctionImpl record that
 * OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmx, ...) registers above.
 * Fix: the generated code declared a comma pair and returned a comma
 * expression referencing the undeclared identifier err_inter8x8_u8_avg,
 * which does not compile; restored to the conventional getter form.
 */
OilFunctionImpl* __oil_function_impl_err_inter8x8_u8_avg_mmx(void) {
  return &_oil_function_impl_err_inter8x8_u8_avg_mmx;
}
#endif
1.369 +
#ifdef __SYMBIAN32__
/*
 * Symbian export shim: returns the OilFunctionImpl record that
 * OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmxext, ...) registers when
 * ENABLE_BROKEN_IMPLS is defined.  Guarded identically so it only exists
 * alongside the impl it names.
 * Fix: the generated code declared a comma pair and returned a comma
 * expression referencing the undeclared identifier err_inter8x8_u8_avg,
 * which does not compile; restored to the conventional getter form.
 */
#ifdef ENABLE_BROKEN_IMPLS
OilFunctionImpl* __oil_function_impl_err_inter8x8_u8_avg_mmxext(void) {
  return &_oil_function_impl_err_inter8x8_u8_avg_mmxext;
}
#endif
#endif
1.376 +