1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/i386/wavelet.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,2347 @@
1.4 +//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
1.5 +/*
1.6 +Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org>
1.7 + All rights reserved.
1.8 +
1.9 + Redistribution and use in source and binary forms, with or without
1.10 + modification, are permitted provided that the following conditions
1.11 + are met:
1.12 + 1. Redistributions of source code must retain the above copyright
1.13 + notice, this list of conditions and the following disclaimer.
1.14 + 2. Redistributions in binary form must reproduce the above copyright
1.15 + notice, this list of conditions and the following disclaimer in the
1.16 + documentation and/or other materials provided with the distribution.
1.17 +
1.18 + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
1.19 + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1.20 + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.21 + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
1.22 + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1.23 + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1.24 + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1.25 + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
1.26 + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
1.27 + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.28 + POSSIBILITY OF SUCH DAMAGE.
1.29 +*/
1.30 +
1.31 +#include <liboil/liboilfunction.h>
1.32 +#include <liboil/liboilclasses.h>
1.33 +
1.34 +
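+/*
+ * 5/3 wavelet lifting on an interleaved signal: the odd samples are
+ * "predicted" from their even neighbours, then the even samples are
+ * "updated" from the resulting high-pass values:
+ *
+ *   d[2i+1] = s[2i+1] - ((s[2i] + s[2i+2]) >> 1)     (predict)
+ *   d[2i]   = s[2i]   + ((d[2i-1] + d[2i+1]) >> 2)   (update)
+ *
+ * The first and last samples use one-sided forms of the same formulas,
+ * as in the reference implementation below.
+ */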
1.35 +void
1.36 +split_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
1.37 +{
1.38 + int i;
1.39 +
1.40 + if (n == 0) return;
1.41 + /* predict */
1.42 + for(i=1;i<n*2-2;i+=2){
1.43 + d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 1);
1.44 + }
1.45 + d_2xn[n*2-1] = s_2xn[n*2-1] - s_2xn[n*2-2];
1.46 +
1.47 + /* update */
1.48 + d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
1.49 + for(i=2;i<n*2;i+=2){
1.50 + d_2xn[i] = s_2xn[i] + ((d_2xn[i-1] + d_2xn[i+1]) >> 2);
1.51 + }
1.52 +}
1.53 +OIL_DEFINE_IMPL (split_53_nomix, split_53);
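+/*
+ * Sketch of how these impls are reached (assuming liboil's usual oil_
+ * prefix for generated class entry points; not part of this file):
+ *
+ *   oil_init ();                   // profiles and picks an impl per class
+ *   oil_split_53 (dest, src, n);   // dispatches to the selected impl
+ */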
1.54 +
1.55 +#if 0
+/* disabled in-place variant: transforms a single interleaved array i_n */
1.56 +void
1.57 +synth_53_nomix (int16_t *i_n, int n)
1.58 +{
1.59 + int i;
1.60 +
1.61 + /* predict */
1.62 + i_n[0] -= i_n[1] >> 1;
1.63 + for(i=2;i<n*2;i+=2){
1.64 + i_n[i] -= (i_n[i-1] + i_n[i+1]) >> 2;
1.65 + }
1.66 +
1.67 + /* update */
1.68 + for(i=1;i<n*2-2;i+=2){
1.69 + i_n[i] += (i_n[i+1] + i_n[i-1]) >> 1;
1.70 + }
1.71 + i_n[n*2-1] += i_n[n*2-2];
1.72 +}
1.73 +#endif
1.74 +
1.75 +
1.76 +void
1.77 +split_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
1.78 +{
1.79 + int i;
1.80 +
1.81 + if (n == 0) return;
1.82 + if (n == 1) {
1.83 + d_2xn[1] = s_2xn[1] - s_2xn[0];
1.84 + d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
1.85 + } else {
1.86 + d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
1.87 + d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
1.88 + d_2xn+=2;
1.89 + s_2xn+=2;
1.90 + for(i=0;i<(n*2-4)/2;i++){
1.91 + d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
1.92 + d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
1.93 + d_2xn+=2;
1.94 + s_2xn+=2;
1.95 + }
1.96 + d_2xn[1] = s_2xn[1] - s_2xn[0];
1.97 + d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
1.98 + }
1.99 +}
1.100 +OIL_DEFINE_IMPL (split_53_c, split_53);
1.101 +
1.102 +void
1.103 +synth_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
1.104 +{
1.105 + int i;
1.106 +
1.107 + if (n == 0) return;
1.108 + if (n == 1) {
1.109 + d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
1.110 + d_2xn[1] = s_2xn[1] + d_2xn[0];
1.111 + } else {
1.112 + d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
1.113 + for(i=2;i<n*2-2;i+=2){
1.114 + d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
1.115 + d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
1.116 + }
1.117 + d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
1.118 + d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
1.119 + d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
1.120 + }
1.121 +}
1.122 +OIL_DEFINE_IMPL (synth_53_c, synth_53);
1.123 +
1.124 +void
1.125 +deinterleave2_c_1 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.126 +{
1.127 + int i;
1.128 +
1.129 + for(i=0;i<n;i++) {
1.130 + d1[i] = s_2xn[2*i];
1.131 + d2[i] = s_2xn[2*i + 1];
1.132 + }
1.133 +}
1.134 +OIL_DEFINE_IMPL (deinterleave2_c_1, deinterleave2_s16);
1.135 +
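+/*
+ * The asm variants below first peel off elements one at a time until n is
+ * a multiple of the SIMD width (the "while (n&k)" loops), then run the
+ * main loop on whole MMX registers.  %ecx counts elements, so addressing
+ * scales by 2 for the int16_t outputs and by 4 for the interleaved source.
+ */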
1.136 +void
1.137 +deinterleave2_asm (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.138 +{
1.139 +  while (n&1) {
1.140 +    d1[0] = s_2xn[0];
1.141 +    d2[0] = s_2xn[1];
1.142 +    d1++;
1.143 +    d2++;
1.144 +    s_2xn+=2;
1.145 +    n--;
1.146 +  }
1.147 +  /* n may have dropped to 0 here (e.g. n==1 on entry); the asm loop
1.148 +     below starts its counter at n-2 and must not run in that case */
+  if (n == 0) return;
1.149 +
1.150 + asm volatile ("\n"
1.151 + " mov %3, %%ecx\n"
1.152 + " sub $2, %%ecx\n"
1.153 + "1:\n"
1.154 + " movw (%1,%%ecx,4), %%ax\n"
1.155 + " movw %%ax, (%0,%%ecx,2)\n"
1.156 + " movw 2(%1,%%ecx,4), %%ax\n"
1.157 + " movw %%ax, (%2,%%ecx,2)\n"
1.158 + " movw 4(%1,%%ecx,4), %%ax\n"
1.159 + " movw %%ax, 2(%0,%%ecx,2)\n"
1.160 + " movw 6(%1,%%ecx,4), %%ax\n"
1.161 + " movw %%ax, 2(%2,%%ecx,2)\n"
1.162 + " sub $2, %%ecx\n"
1.163 + " jge 1b\n"
1.164 + : "+r" (d1), "+r" (s_2xn), "+r" (d2)
1.165 + : "m" (n)
1.166 + : "eax", "ecx");
1.167 +}
1.168 +OIL_DEFINE_IMPL (deinterleave2_asm, deinterleave2_s16);
1.169 +
1.170 +void
1.171 +deinterleave2_mmx (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.172 +{
1.173 + while (n&3) {
1.174 + d1[0] = s_2xn[0];
1.175 + d2[0] = s_2xn[1];
1.176 + d1++;
1.177 + d2++;
1.178 + s_2xn+=2;
1.179 + n--;
1.180 + }
1.181 + if (n==0) return;
1.182 +
1.183 + asm volatile ("\n"
1.184 + " xor %%ecx, %%ecx\n"
1.185 + "1:\n"
1.186 + " movq (%1,%%ecx,4), %%mm0\n"
1.187 + " movq 8(%1,%%ecx,4), %%mm1\n"
1.188 + " pslld $16, %%mm0\n"
1.189 + " pslld $16, %%mm1\n"
1.190 + " psrad $16, %%mm0\n"
1.191 + " psrad $16, %%mm1\n"
1.192 + " packssdw %%mm1, %%mm0\n"
1.193 + " movq %%mm0, (%0,%%ecx,2)\n"
1.194 + " movq (%1,%%ecx,4), %%mm0\n"
1.195 + " movq 8(%1,%%ecx,4), %%mm1\n"
1.196 + " psrad $16, %%mm0\n"
1.197 + " psrad $16, %%mm1\n"
1.198 + " packssdw %%mm1, %%mm0\n"
1.199 + " movq %%mm0, (%2,%%ecx,2)\n"
1.200 + " add $4, %%ecx\n"
1.201 + " cmp %3, %%ecx\n"
1.202 + " jl 1b\n"
1.203 + " emms\n"
1.204 + : "+r" (d1), "+r" (s_2xn), "+r" (d2)
1.205 + : "m" (n)
1.206 + : "eax", "ecx");
1.207 +}
1.208 +OIL_DEFINE_IMPL_FULL (deinterleave2_mmx, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
1.209 +
1.210 +void
1.211 +deinterleave2_mmx_2 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.212 +{
1.213 + while (n&3) {
1.214 + d1[0] = s_2xn[0];
1.215 + d2[0] = s_2xn[1];
1.216 + d1++;
1.217 + d2++;
1.218 + s_2xn+=2;
1.219 + n--;
1.220 + }
1.221 + if (n==0) return;
1.222 +
1.223 + asm volatile ("\n"
1.224 + " xor %%ecx, %%ecx\n"
1.225 + "1:\n"
1.226 + " pshufw $0xd8, (%1,%%ecx,4), %%mm0\n"
1.227 + " movd %%mm0, (%0,%%ecx,2)\n"
1.228 + " pshufw $0x8d, (%1,%%ecx,4), %%mm0\n"
1.229 + " movd %%mm0, (%2,%%ecx,2)\n"
1.230 + " add $2, %%ecx\n"
1.231 + " cmp %3, %%ecx\n"
1.232 + " jl 1b\n"
1.233 + " emms\n"
1.234 + : "+r" (d1), "+r" (s_2xn), "+r" (d2)
1.235 + : "m" (n)
1.236 + : "eax", "ecx");
1.237 +}
1.238 +OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_2, deinterleave2_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.239 +
1.240 +void
1.241 +deinterleave2_mmx_3 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.242 +{
1.243 + while (n&3) {
1.244 + d1[0] = s_2xn[0];
1.245 + d2[0] = s_2xn[1];
1.246 + d1++;
1.247 + d2++;
1.248 + s_2xn+=2;
1.249 + n--;
1.250 + }
1.251 + if (n==0) return;
1.252 +
1.253 + asm volatile ("\n"
1.254 + " xor %%ecx, %%ecx\n"
1.255 + "1:\n"
1.256 + " movq (%1,%%ecx,4), %%mm1\n"
1.257 + " movq (%1,%%ecx,4), %%mm2\n"
1.258 + " movq 8(%1,%%ecx,4), %%mm0\n"
1.259 + " punpcklwd %%mm0, %%mm1\n"
1.260 + " punpckhwd %%mm0, %%mm2\n"
1.261 + " movq %%mm1, %%mm0\n"
1.262 + " punpcklwd %%mm2, %%mm0\n"
1.263 + " punpckhwd %%mm2, %%mm1\n"
1.264 + " movq %%mm0, (%0,%%ecx,2)\n"
1.265 + " movq %%mm1, (%2,%%ecx,2)\n"
1.266 + " add $4, %%ecx\n"
1.267 + " cmp %3, %%ecx\n"
1.268 + " jl 1b\n"
1.269 + " emms\n"
1.270 + : "+r" (d1), "+r" (s_2xn), "+r" (d2)
1.271 + : "m" (n)
1.272 + : "eax", "ecx");
1.273 +}
1.274 +OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_3, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
1.275 +
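+/* Same even/odd unpack as deinterleave2_mmx_3, doubled up: 16 source
+   words in, 8 words to each destination per iteration. */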
1.276 +void
1.277 +deinterleave2_mmx_4 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
1.278 +{
1.279 + while (n&7) {
1.280 + d1[0] = s_2xn[0];
1.281 + d2[0] = s_2xn[1];
1.282 + d1++;
1.283 + d2++;
1.284 + s_2xn+=2;
1.285 + n--;
1.286 + }
1.287 + if (n==0) return;
1.288 +
1.289 + asm volatile ("\n"
1.290 + " xor %%ecx, %%ecx\n"
1.291 + "1:\n"
1.292 + " movq (%1,%%ecx,4), %%mm1\n"
1.293 + " movq %%mm1, %%mm2\n"
1.294 + " movq 8(%1,%%ecx,4), %%mm0\n"
1.295 + " movq 16(%1,%%ecx,4), %%mm5\n"
1.296 + " punpcklwd %%mm0, %%mm1\n"
1.297 + " movq %%mm5, %%mm6\n"
1.298 + " punpckhwd %%mm0, %%mm2\n"
1.299 + " movq 24(%1,%%ecx,4), %%mm4\n"
1.300 + " movq %%mm1, %%mm0\n"
1.301 + " punpcklwd %%mm4, %%mm5\n"
1.302 + " punpcklwd %%mm2, %%mm0\n"
1.303 + " punpckhwd %%mm4, %%mm6\n"
1.304 + " punpckhwd %%mm2, %%mm1\n"
1.305 + " movq %%mm5, %%mm4\n"
1.306 + " movq %%mm0, (%0,%%ecx,2)\n"
1.307 + " punpcklwd %%mm6, %%mm4\n"
1.308 + " movq %%mm1, (%2,%%ecx,2)\n"
1.309 + " punpckhwd %%mm6, %%mm5\n"
1.310 + " movq %%mm4, 8(%0,%%ecx,2)\n"
1.311 + " movq %%mm5, 8(%2,%%ecx,2)\n"
1.312 + " add $8, %%ecx\n"
1.313 + " cmp %3, %%ecx\n"
1.314 + " jl 1b\n"
1.315 + " emms\n"
1.316 + : "+r" (d1), "+r" (s_2xn), "+r" (d2)
1.317 + : "m" (n)
1.318 + : "eax", "ecx");
1.319 +}
1.320 +OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_4, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
1.321 +
1.322 +
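+/*
+ * lift_add_mult_shift12: d[i] = s1[i] + ((s4[0]*(s2[i] + s3[i])) >> 12),
+ * with the single multiplier taken from s4 and broadcast to all lanes.
+ */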
1.323 +void
1.324 +lift_add_mult_shift12_i386_mmx (int16_t *d, int16_t *s1, int16_t *s2,
1.325 + int16_t *s3, int16_t *s4, int n)
1.326 +{
1.327 +  uint32_t val;
1.328 +
1.329 + while (n&3) {
1.330 + d[0] = s1[0] + ((s4[0]*(s2[0] + s3[0]))>>12);
1.331 + d++;
1.332 + s1++;
1.333 + s2++;
1.334 + s3++;
1.335 + n--;
1.336 + }
1.337 + if (n==0) return;
1.338 +
1.339 + val = ((*(uint16_t *)s4)<<16) | (*(uint16_t *)s4);
1.340 + n>>=2;
1.341 + asm volatile ("\n"
1.342 + " mov %4, %%ecx\n"
1.343 + " movd %%ecx, %%mm7\n"
1.344 + " punpcklwd %%mm7, %%mm7\n"
1.345 + " mov %5, %%ecx\n"
1.346 + "1:\n"
1.347 + " movq 0(%2), %%mm0\n"
1.348 + " paddsw 0(%3), %%mm0\n"
1.349 + " movq %%mm0, %%mm1\n"
1.350 + " pmullw %%mm7, %%mm0\n"
1.351 + " pmulhw %%mm7, %%mm1\n"
1.352 + " psrlw $12, %%mm0\n"
1.353 + " psllw $4, %%mm1\n"
1.354 + " por %%mm1, %%mm0\n"
1.355 + " paddsw 0(%1), %%mm0\n"
1.356 + " movq %%mm0, 0(%0)\n"
1.357 + " add $8, %0\n"
1.358 + " add $8, %1\n"
1.359 + " add $8, %2\n"
1.360 + " add $8, %3\n"
1.361 + " decl %%ecx\n"
1.362 + " jne 1b\n"
1.363 + " emms\n"
1.364 + : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
1.365 + : "m" (val), "m" (n)
1.366 + : "ecx");
1.367 +}
1.368 +OIL_DEFINE_IMPL_FULL (lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12, OIL_IMPL_FLAG_MMX);
1.369 +
1.370 +void
1.371 +interleave2_mmx (int16_t *d_2xn, int16_t *s1, int16_t *s2, int n)
1.372 +{
1.373 + while (n&3) {
1.374 + d_2xn[0] = s1[0];
1.375 + d_2xn[1] = s2[0];
1.376 + s1++;
1.377 + s2++;
1.378 + d_2xn+=2;
1.379 + n--;
1.380 + }
1.381 + if (n==0) return;
1.382 +
1.383 + asm volatile ("\n"
1.384 + " xor %%ecx, %%ecx\n"
1.385 + "1:\n"
1.386 + " movq (%1,%%ecx,2), %%mm0\n"
1.387 + " movq (%2,%%ecx,2), %%mm1\n"
1.388 + " movq %%mm0, %%mm2\n"
1.389 + " punpckhwd %%mm1, %%mm0\n"
1.390 + " punpcklwd %%mm1, %%mm2\n"
1.391 + " movq %%mm2, (%0,%%ecx,4)\n"
1.392 + " movq %%mm0, 8(%0,%%ecx,4)\n"
1.393 + " add $4, %%ecx\n"
1.394 + " cmp %3, %%ecx\n"
1.395 + " jl 1b\n"
1.396 + " emms\n"
1.397 + : "+r" (d_2xn), "+r" (s1), "+r" (s2)
1.398 + : "m" (n)
1.399 + : "eax", "ecx");
1.400 +}
1.401 +OIL_DEFINE_IMPL_FULL (interleave2_mmx, interleave2_s16, OIL_IMPL_FLAG_MMX);
1.402 +
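+/*
+ * The four lift_{add,sub}_shift{1,2} kernels below compute
+ * d[i] = s1[i] +/- ((s2[i] + s3[i]) >> k) for k = 1 or 2; these are the
+ * predict and update steps of the 5/3 lifting scheme, four samples at a
+ * time.
+ */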
1.403 +void
1.404 +lift_add_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
1.405 +{
1.406 + while (n&3) {
1.407 + d[0] = s1[0] + ((s2[0] + s3[0])>>1);
1.408 + d++;
1.409 + s1++;
1.410 + s2++;
1.411 + s3++;
1.412 + n--;
1.413 + }
1.414 + if (n==0) return;
1.415 +
1.416 + asm volatile ("\n"
1.417 + " xor %%ecx, %%ecx\n"
1.418 + "1:\n"
1.419 + " movq (%2,%%ecx,2), %%mm1\n"
1.420 + " movq (%3,%%ecx,2), %%mm2\n"
1.421 + " paddw %%mm2, %%mm1\n"
1.422 + " psraw $1, %%mm1\n"
1.423 + " paddw (%1,%%ecx,2), %%mm1\n"
1.424 + " movq %%mm1, (%0,%%ecx,2)\n"
1.425 + " add $4, %%ecx\n"
1.426 + " cmp %4, %%ecx\n"
1.427 + " jl 1b\n"
1.428 + " emms\n"
1.429 + : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
1.430 + : "m" (n)
1.431 + : "ecx");
1.432 +}
1.433 +OIL_DEFINE_IMPL_FULL (lift_add_shift1_mmx, lift_add_shift1, OIL_IMPL_FLAG_MMX);
1.434 +
1.435 +void
1.436 +lift_sub_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
1.437 +{
1.438 + while (n&3) {
1.439 + d[0] = s1[0] - ((s2[0] + s3[0])>>1);
1.440 + d++;
1.441 + s1++;
1.442 + s2++;
1.443 + s3++;
1.444 + n--;
1.445 + }
1.446 + if (n==0) return;
1.447 +
1.448 + asm volatile ("\n"
1.449 + " xor %%ecx, %%ecx\n"
1.450 + "1:\n"
1.451 + " movq (%2,%%ecx,2), %%mm1\n"
1.452 + " movq (%3,%%ecx,2), %%mm2\n"
1.453 + " movq (%1,%%ecx,2), %%mm0\n"
1.454 + " paddw %%mm2, %%mm1\n"
1.455 + " psraw $1, %%mm1\n"
1.456 + " psubw %%mm1, %%mm0\n"
1.457 + " movq %%mm0, (%0,%%ecx,2)\n"
1.458 + " add $4, %%ecx\n"
1.459 + " cmp %4, %%ecx\n"
1.460 + " jl 1b\n"
1.461 + " emms\n"
1.462 + : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
1.463 + : "m" (n)
1.464 + : "ecx");
1.465 +}
1.466 +OIL_DEFINE_IMPL_FULL (lift_sub_shift1_mmx, lift_sub_shift1, OIL_IMPL_FLAG_MMX);
1.467 +
1.468 +void
1.469 +lift_add_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
1.470 +{
1.471 + while (n&3) {
1.472 + d[0] = s1[0] + ((s2[0] + s3[0])>>2);
1.473 + d++;
1.474 + s1++;
1.475 + s2++;
1.476 + s3++;
1.477 + n--;
1.478 + }
1.479 + if (n==0) return;
1.480 +
1.481 + asm volatile ("\n"
1.482 + " xor %%ecx, %%ecx\n"
1.483 + "1:\n"
1.484 + " movq (%2,%%ecx,2), %%mm1\n"
1.485 + " movq (%3,%%ecx,2), %%mm2\n"
1.486 + " paddw %%mm2, %%mm1\n"
1.487 + " psraw $2, %%mm1\n"
1.488 + " paddw (%1,%%ecx,2), %%mm1\n"
1.489 + " movq %%mm1, (%0,%%ecx,2)\n"
1.490 + " add $4, %%ecx\n"
1.491 + " cmp %4, %%ecx\n"
1.492 + " jl 1b\n"
1.493 + " emms\n"
1.494 + : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
1.495 + : "m" (n)
1.496 + : "ecx");
1.497 +}
1.498 +OIL_DEFINE_IMPL_FULL (lift_add_shift2_mmx, lift_add_shift2, OIL_IMPL_FLAG_MMX);
1.499 +
1.500 +void
1.501 +lift_sub_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
1.502 +{
1.503 + while (n&3) {
1.504 + d[0] = s1[0] - ((s2[0] + s3[0])>>2);
1.505 + d++;
1.506 + s1++;
1.507 + s2++;
1.508 + s3++;
1.509 + n--;
1.510 + }
1.511 + if (n==0) return;
1.512 +
1.513 + asm volatile ("\n"
1.514 + " xor %%ecx, %%ecx\n"
1.515 + "1:\n"
1.516 + " movq (%2,%%ecx,2), %%mm1\n"
1.517 + " movq (%3,%%ecx,2), %%mm2\n"
1.518 + " movq (%1,%%ecx,2), %%mm0\n"
1.519 + " paddw %%mm2, %%mm1\n"
1.520 + " psraw $2, %%mm1\n"
1.521 + " psubw %%mm1, %%mm0\n"
1.522 + " movq %%mm0, (%0,%%ecx,2)\n"
1.523 + " add $4, %%ecx\n"
1.524 + " cmp %4, %%ecx\n"
1.525 + " jl 1b\n"
1.526 + " emms\n"
1.527 + : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
1.528 + : "m" (n)
1.529 + : "ecx");
1.530 +}
1.531 +OIL_DEFINE_IMPL_FULL (lift_sub_shift2_mmx, lift_sub_shift2, OIL_IMPL_FLAG_MMX);
1.532 +
1.533 +#ifdef ENABLE_BROKEN_IMPLS
1.534 +void
1.535 +synth_53_mmx (int16_t *d_2xn, int16_t *s_2xn, int n)
1.536 +{
1.537 + int i;
1.538 +
1.539 + if (n==0) return;
1.540 + if (n == 1) {
1.541 + d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
1.542 + d_2xn[1] = s_2xn[1] + d_2xn[0];
1.543 + } else {
1.544 + int i;
1.545 +
1.546 + d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
1.547 +
1.548 + if (n > 6) {
1.549 + n-=5;
1.550 +
1.551 + asm volatile ("\n"
1.552 + " xor %%ecx, %%ecx\n"
1.553 + " movw 2(%1), %%ecx\n"
1.554 + " movd %%ecx, %%mm7\n"
1.555 + " movw 0(%0), %%ecx\n"
1.556 + " movd %%ecx, %%mm6\n"
1.557 + " movw 0(%1), %%ecx\n"
1.558 + " movd %%ecx, %%mm5\n"
1.559 +
1.560 + " xor %%ecx, %%ecx\n"
1.561 + "1:\n"
1.562 + " movq 4(%1,%%ecx,4), %%mm1\n" // mm1 = s5 s4 s3 s2
1.563 + " movq %%mm1, %%mm2\n" // mm2 = s5 s4 s3 s2
1.564 + " movq 12(%1,%%ecx,4), %%mm0\n" // mm0 = s9 s8 s7 s6
1.565 + " punpcklwd %%mm0, %%mm1\n" // mm1 = s7 s3 s6 s2
1.566 + " punpckhwd %%mm0, %%mm2\n" // mm2 = s9 s5 s8 s4
1.567 + " movq %%mm1, %%mm0\n" // mm0 = s7 s3 s6 s2
1.568 + " punpcklwd %%mm2, %%mm0\n" // mm0 = s8 s6 s4 s2
1.569 + " punpckhwd %%mm2, %%mm1\n" // mm1 = s9 s7 s5 s3
1.570 + //" movq %%mm0, %%mm3\n" // mm0 = s8 s6 s4 s2
1.571 +
1.572 + " movq %%mm1, %%mm2\n" // mm2 = s9 s7 s5 s3
1.573 + " psllq $16, %%mm2\n" // mm2 = s7 s5 s3 00
1.574 + " por %%mm7, %%mm2\n" // mm2 = s7 s5 s3 s1
1.575 + " movq %%mm2, %%mm4\n" // mm4 = s7 s5 s3 s1
1.576 + " paddw %%mm1, %%mm2\n" // mm2 = s9+s7 ...
1.577 + " psraw $2, %%mm2\n" // mm2 = (s9+s7)>>2 ...
1.578 + " movq %%mm1, %%mm7\n" // mm7 = s9 s7 s5 s3
1.579 + " psrlq $48, %%mm7\n" // mm7 = 00 00 00 s9
1.580 + " psubw %%mm2, %%mm0\n" // mm0 = d8 d6 d4 d2
1.581 +
1.582 + " movq %%mm0, %%mm1\n" // mm1 = d8 d6 d4 d2
1.583 + " movq %%mm0, %%mm3\n" // mm1 = d8 d6 d4 d2
1.584 + " psllq $16, %%mm0\n" // mm0 = d6 d4 d2 00
1.585 + " por %%mm6, %%mm0\n" // mm0 = d6 d4 d2 d0
1.586 + " psrlq $48, %%mm1\n" // mm1 = 00 00 00 d8
1.587 + " movq %%mm1, %%mm6\n" // mm6 = 00 00 00 d8
1.588 +
1.589 + " movq %%mm0, %%mm1\n"
1.590 + " paddw %%mm3, %%mm1\n" // mm0 = d8+d6 ...
1.591 + " psraw $1, %%mm1\n" // mm1 = (d8+d6)>>1 ...
1.592 + " paddw %%mm4, %%mm1\n" // mm1 = d7 d5 d3 d1
1.593 +
1.594 + " movq %%mm1, %%mm2\n"
1.595 +
1.596 + " movq %%mm0, %%mm1\n"
1.597 + " punpcklwd %%mm2, %%mm0\n"
1.598 + " punpckhwd %%mm2, %%mm1\n"
1.599 +
1.600 + " movq %%mm0, (%0, %%ecx, 4)\n"
1.601 + " movq %%mm1, 8(%0, %%ecx, 4)\n"
1.602 +
1.603 + " add $4, %%ecx\n"
1.604 + " cmp %3, %%ecx\n"
1.605 + " jl 1b\n"
1.606 + " emms\n"
1.607 + : "+r" (d_2xn), "+r" (s_2xn), "+ecx" (i)
1.608 + : "m" (n));
1.609 +
1.610 + i*=2;
1.611 + n+=5;
1.612 + d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
1.613 + i+=2;
1.614 + } else {
1.615 + i = 2;
1.616 + }
1.617 + for(;i<n*2-2;i+=2){
1.618 + d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
1.619 + d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
1.620 + }
1.621 + d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
1.622 + d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
1.623 + d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
1.624 + }
1.625 +}
1.626 +OIL_DEFINE_IMPL_FULL (synth_53_mmx, synth_53, OIL_IMPL_FLAG_MMX);
1.627 +#endif
1.628 +
1.629 +
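+/*
+ * mas2_add_s16: 2-tap multiply-accumulate, with the rounding offset and
+ * shift packed into s4_2:
+ *
+ *   d1[i] = s1[i] + ((s4_2[0] + s2[i]*s3_2[0] + s2[i+1]*s3_2[1]) >> s4_2[1])
+ */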
1.630 +void
1.631 +mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.632 + int16_t *s4_2, int n)
1.633 +{
1.634 + int shift = s4_2[1];
1.635 +
1.636 + while (n&3) {
1.637 + int x;
1.638 +
1.639 + x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
1.640 + x >>= s4_2[1];
1.641 + d1[0] = s1[0] + x;
1.642 +
1.643 + d1++;
1.644 + s1++;
1.645 + s2++;
1.646 + n--;
1.647 + }
1.648 + if (n==0) return;
1.649 +
1.650 + n>>=2;
1.651 + asm volatile ("\n"
1.652 + " movzwl 0(%0), %%ecx\n"
1.653 + " movd %%ecx, %%mm7\n"
1.654 + " pshufw $0x00, %%mm7, %%mm7\n"
1.655 + " movzwl 2(%0), %%ecx\n"
1.656 + " movd %%ecx, %%mm6\n"
1.657 + " pshufw $0x00, %%mm6, %%mm6\n"
1.658 + " movzwl 0(%1), %%ecx\n"
1.659 + " movd %%ecx, %%mm5\n"
1.660 + " pshufw $0x44, %%mm5, %%mm5\n"
1.661 + :: "r" (s3_2), "r" (s4_2)
1.662 + : "ecx"
1.663 + );
1.664 + asm volatile ("\n"
1.665 + "1:\n"
1.666 + " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
1.667 + " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
1.668 + " pmullw %%mm7, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
1.669 + " pmulhw %%mm7, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
1.670 + " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
1.671 + " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
1.672 + " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
1.673 + " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0
1.674 +
1.675 + " movq 2(%2), %%mm2\n"
1.676 + " movq 2(%2), %%mm3\n"
1.677 + " pmullw %%mm6, %%mm2\n"
1.678 + " pmulhw %%mm6, %%mm3\n"
1.679 + " movq %%mm2, %%mm4\n"
1.680 + " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
1.681 + " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
1.682 + " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1
1.683 +
1.684 + " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
1.685 + " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...
1.686 +
1.687 + " paddd %%mm5, %%mm1\n" // mm1 = s2*a0 + s3*a1 + offset, ...
1.688 + " paddd %%mm5, %%mm0\n" // mm0 = s0*a0 + s1*a1 + offset, ...
1.689 +
1.690 + " movd %4, %%mm4\n"
1.691 + " psrad %%mm4, %%mm1\n" // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
1.692 + " psrad %%mm4, %%mm0\n" // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
1.693 +
1.694 + " packssdw %%mm1, %%mm0\n"
1.695 + " paddw 0(%1), %%mm0\n"
1.696 + " movq %%mm0, 0(%0)\n"
1.697 + " add $8, %0\n"
1.698 + " add $8, %1\n"
1.699 + " add $8, %2\n"
1.700 + " decl %3\n"
1.701 + " jnz 1b\n"
1.702 + " emms\n"
1.703 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.704 + : "r" (shift)
1.705 + );
1.706 +}
1.707 +OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
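+/* Note: the version above widens the products to 32 bits with
+   pmullw/pmulhw + punpcklwd, as the reference requires; the disabled
+   variants further down keep only 16-bit intermediates. */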
1.708 +
1.709 +#if 0
+/* disabled sketch: never applies the taps in s3_2, and paddq needs SSE2 */
1.710 +void
1.711 +mas2_add_s16_lim_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.712 + int16_t *s4_2, int n)
1.713 +{
1.714 + int shift = s4_2[1];
1.715 +
1.716 + while (n&3) {
1.717 + int x;
1.718 +
1.719 + x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
1.720 + x >>= s4_2[1];
1.721 + d1[0] = s1[0] + x;
1.722 +
1.723 + d1++;
1.724 + s1++;
1.725 + s2++;
1.726 + n--;
1.727 + }
1.728 + if (n==0) return;
1.729 +
1.730 + n>>=2;
1.731 + asm volatile ("\n"
1.732 + " movzwl 0(%0), %%ecx\n"
1.733 + " movd %%ecx, %%mm7\n"
1.734 + " pshufw $0x00, %%mm7, %%mm7\n"
1.735 + " movzwl 2(%0), %%ecx\n"
1.736 + " movd %%ecx, %%mm6\n"
1.737 + " pshufw $0x00, %%mm6, %%mm6\n"
1.738 + " movzwl 0(%1), %%ecx\n"
1.739 + " movd %%ecx, %%mm5\n"
1.740 + " pshufw $0x44, %%mm5, %%mm5\n"
1.741 + :: "r" (s3_2), "r" (s4_2)
1.742 + : "ecx"
1.743 + );
1.744 + asm volatile ("\n"
1.745 + "1:\n"
1.746 + " movq 0(%2), %%mm0\n"
1.747 + " paddq 2(%2), %%mm0\n"
1.748 +
1.749 + " movd %4, %%mm4\n"
1.750 + " psraw %%mm4, %%mm0\n"
1.751 +
1.752 + " paddw 0(%1), %%mm0\n"
1.753 + " movq %%mm0, 0(%0)\n"
1.754 + " add $8, %0\n"
1.755 + " add $8, %1\n"
1.756 + " add $8, %2\n"
1.757 + " decl %3\n"
1.758 + " jnz 1b\n"
1.759 + " emms\n"
1.760 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.761 + : "r" (shift)
1.762 + );
1.763 +}
1.764 +OIL_DEFINE_IMPL_FULL (mas2_add_s16_lim_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.765 +#endif
1.766 +
1.767 +void
1.768 +mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_4,
1.769 + int16_t *s4_2, int n)
1.770 +{
1.771 + int shift = s4_2[1];
1.772 + //int m;
1.773 +
1.774 + //m = n&3;
1.775 +#if 1
1.776 + while (n&3) {
1.777 + int x;
1.778 + int i;
1.779 +
1.780 + x = s4_2[0];
1.781 + for(i=0;i<4;i++){
1.782 + x += s2[i]*s3_4[i];
1.783 + }
1.784 + x >>= s4_2[1];
1.785 + d1[0] = s1[0] + x;
1.786 +
1.787 + d1++;
1.788 + s1++;
1.789 + s2++;
1.790 + n--;
1.791 + }
1.792 +#endif
1.793 + if (n==0) return;
1.794 +
1.795 + n>>=2;
1.796 + asm volatile ("\n"
1.797 + " movq 0(%0), %%mm7\n"
1.798 + " movzwl 0(%1), %%ecx\n"
1.799 + " movd %%ecx, %%mm5\n"
1.800 + " pshufw $0x44, %%mm5, %%mm5\n"
1.801 + :: "r" (s3_4), "r" (s4_2)
1.802 + : "ecx"
1.803 + );
1.804 + asm volatile ("\n"
1.805 + "1:\n"
1.806 + " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
1.807 + " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
1.808 + " pshufw $0x00, %%mm7, %%mm6\n"
1.809 + " pmullw %%mm6, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
1.810 + " pmulhw %%mm6, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
1.811 + " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
1.812 + " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
1.813 + " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
1.814 + " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0
1.815 +
1.816 + " movq 2(%2), %%mm2\n"
1.817 + " movq 2(%2), %%mm3\n"
1.818 + " pshufw $0x55, %%mm7, %%mm6\n"
1.819 + " pmullw %%mm6, %%mm2\n"
1.820 + " pmulhw %%mm6, %%mm3\n"
1.821 + " movq %%mm2, %%mm4\n"
1.822 + " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
1.823 + " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
1.824 + " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1
1.825 + " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
1.826 + " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...
1.827 +
1.828 + " movq 4(%2), %%mm2\n"
1.829 + " movq 4(%2), %%mm3\n"
1.830 + " pshufw $0xaa, %%mm7, %%mm6\n"
1.831 + " pmullw %%mm6, %%mm2\n"
1.832 + " pmulhw %%mm6, %%mm3\n"
1.833 + " movq %%mm2, %%mm4\n"
1.834 + " punpcklwd %%mm3, %%mm2\n"
1.835 + " punpckhwd %%mm3, %%mm4\n"
1.836 + " movq %%mm4, %%mm3\n"
1.837 + " paddd %%mm3, %%mm1\n"
1.838 + " paddd %%mm2, %%mm0\n"
1.839 +
1.840 + " movq 6(%2), %%mm2\n"
1.841 + " movq 6(%2), %%mm3\n"
1.842 + " pshufw $0xff, %%mm7, %%mm6\n"
1.843 + " pmullw %%mm6, %%mm2\n"
1.844 + " pmulhw %%mm6, %%mm3\n"
1.845 + " movq %%mm2, %%mm4\n"
1.846 + " punpcklwd %%mm3, %%mm2\n"
1.847 + " punpckhwd %%mm3, %%mm4\n"
1.848 + " movq %%mm4, %%mm3\n"
1.849 + " paddd %%mm3, %%mm1\n"
1.850 + " paddd %%mm2, %%mm0\n"
1.851 +
1.852 + " paddd %%mm5, %%mm1\n"
1.853 + " paddd %%mm5, %%mm0\n"
1.854 +
1.855 + " movd %4, %%mm4\n"
1.856 + " psrad %%mm4, %%mm1\n"
1.857 + " psrad %%mm4, %%mm0\n"
1.858 +
1.859 + " packssdw %%mm1, %%mm0\n"
1.860 + " paddw 0(%1), %%mm0\n"
1.861 + " movq %%mm0, 0(%0)\n"
1.862 + " add $8, %0\n"
1.863 + " add $8, %1\n"
1.864 + " add $8, %2\n"
1.865 + " decl %3\n"
1.866 + " jnz 1b\n"
1.867 + " emms\n"
1.868 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.869 + : "r" (shift)
1.870 + );
1.871 +#if 0
1.872 + while (m) {
1.873 + int x;
1.874 + int i;
1.875 +
1.876 + x = s4_2[0];
1.877 + for(i=0;i<4;i++){
1.878 + x += s2[i]*s3_4[i];
1.879 + }
1.880 + x >>= s4_2[1];
1.881 + d1[0] = s1[0] + x;
1.882 +
1.883 + d1++;
1.884 + s1++;
1.885 + s2++;
1.886 + m--;
1.887 + }
1.888 +#endif
1.889 +}
1.890 +OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.891 +
1.892 +#if 0
1.893 +/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
1.894 +void
1.895 +mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.896 + int16_t *s4_2, int n)
1.897 +{
1.898 + while (n&3) {
1.899 + int x;
1.900 +
1.901 + x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
1.902 + x >>= s4_2[1];
1.903 + d1[0] = s1[0] + x;
1.904 +
1.905 + d1++;
1.906 + s1++;
1.907 + s2++;
1.908 + n--;
1.909 + }
1.910 + if (n==0) return;
1.911 +
1.912 + n>>=2;
1.913 + asm volatile ("\n"
1.914 + " movzwl 0(%0), %%ecx\n"
1.915 + " movd %%ecx, %%mm7\n"
1.916 + " pshufw $0x00, %%mm7, %%mm7\n"
1.917 + " movzwl 2(%0), %%ecx\n"
1.918 + " movd %%ecx, %%mm6\n"
1.919 + " pshufw $0x00, %%mm6, %%mm6\n"
1.920 + " movzwl 0(%1), %%ecx\n"
1.921 + " movd %%ecx, %%mm5\n"
1.922 + " pshufw $0x00, %%mm5, %%mm5\n"
1.923 + " movzwl 2(%1), %%ecx\n"
1.924 + " movd %%ecx, %%mm4\n"
1.925 + :: "r" (s3_2), "r" (s4_2)
1.926 + : "ecx"
1.927 + );
1.928 + asm volatile ("\n"
1.929 + "1:\n"
1.930 + " movq 0(%2), %%mm0\n"
1.931 + " pmullw %%mm7, %%mm0\n"
1.932 + " movq 2(%2), %%mm1\n"
1.933 + " pmullw %%mm6, %%mm1\n"
1.934 + " paddw %%mm1, %%mm0\n"
1.935 + " paddw %%mm5, %%mm0\n"
1.936 + " psraw %%mm4, %%mm0\n"
1.937 + " paddw 0(%1), %%mm0\n"
1.938 + " movq %%mm0, 0(%0)\n"
1.939 + " add $8, %0\n"
1.940 + " add $8, %1\n"
1.941 + " add $8, %2\n"
1.942 + " decl %3\n"
1.943 + " jnz 1b\n"
1.944 + " emms\n"
1.945 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.946 + );
1.947 +}
1.948 +OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.949 +#endif
1.950 +
1.951 +
1.952 +#if 0
1.953 +/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
1.954 +void
1.955 +mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.956 + int16_t *s4_2, int n)
1.957 +{
1.958 + while (n&3) {
1.959 + int x;
1.960 +
1.961 +    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1] +
1.962 +        s2[2]*s3_2[2] + s2[3]*s3_2[3];
1.963 + x >>= s4_2[1];
1.964 + d1[0] = s1[0] + x;
1.965 +
1.966 + d1++;
1.967 + s1++;
1.968 + s2++;
1.969 + n--;
1.970 + }
1.971 + if (n==0) return;
1.972 +
1.973 + n>>=2;
1.974 + asm volatile ("\n"
1.975 + " movzwl 0(%0), %%ecx\n"
1.976 + " movd %%ecx, %%mm7\n"
1.977 + " pshufw $0x00, %%mm7, %%mm7\n"
1.978 + " movzwl 2(%0), %%ecx\n"
1.979 + " movd %%ecx, %%mm6\n"
1.980 + " pshufw $0x00, %%mm6, %%mm6\n"
1.981 + " movzwl 2(%0), %%ecx\n"
1.982 + " movd %%ecx, %%mm5\n"
1.983 + " pshufw $0x00, %%mm5, %%mm5\n"
1.984 + " movzwl 2(%0), %%ecx\n"
1.985 + " movd %%ecx, %%mm4\n"
1.986 + " pshufw $0x00, %%mm4, %%mm4\n"
1.987 + " movzwl 0(%1), %%ecx\n"
1.988 + " movd %%ecx, %%mm3\n"
1.989 + " pshufw $0x00, %%mm3, %%mm3\n"
1.990 + " movzwl 2(%1), %%ecx\n"
1.991 + " movd %%ecx, %%mm2\n"
1.992 + :: "r" (s3_2), "r" (s4_2)
1.993 + : "ecx"
1.994 + );
1.995 + asm volatile ("\n"
1.996 + "1:\n"
1.997 + " movq 0(%2), %%mm0\n"
1.998 + " pmullw %%mm7, %%mm0\n"
1.999 + " movq 2(%2), %%mm1\n"
1.1000 + " pmullw %%mm6, %%mm1\n"
1.1001 + " paddw %%mm1, %%mm0\n"
1.1002 + " movq 4(%2), %%mm1\n"
1.1003 + " pmullw %%mm5, %%mm1\n"
1.1004 + " paddw %%mm1, %%mm0\n"
1.1005 + " movq 6(%2), %%mm1\n"
1.1006 + " pmullw %%mm4, %%mm1\n"
1.1007 + " paddw %%mm1, %%mm0\n"
1.1008 + " paddw %%mm3, %%mm0\n"
1.1009 + " psraw %%mm2, %%mm0\n"
1.1010 + " paddw 0(%1), %%mm0\n"
1.1011 + " movq %%mm0, 0(%0)\n"
1.1012 + " add $8, %0\n"
1.1013 + " add $8, %1\n"
1.1014 + " add $8, %2\n"
1.1015 + " decl %3\n"
1.1016 + " jnz 1b\n"
1.1017 + " emms\n"
1.1018 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1019 + );
1.1020 +}
1.1021 +OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1022 +#endif
1.1023 +
1.1024 +
1.1025 +#if 0
1.1026 +/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
1.1027 +void
1.1028 +mas8_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1029 + int16_t *s4_2, int n)
1.1030 +{
1.1031 + while (n&3) {
1.1032 + int x;
1.1033 + int i;
1.1034 +
1.1035 + x = s4_2[0];
1.1036 + for(i=0;i<8;i++){
1.1037 + x += s2[i]*s3_2[i];
1.1038 + }
1.1039 + x >>= s4_2[1];
1.1040 + d1[0] = s1[0] + x;
1.1041 +
1.1042 + d1++;
1.1043 + s1++;
1.1044 + s2++;
1.1045 + n--;
1.1046 + }
1.1047 + if (n==0) return;
1.1048 +
1.1049 + n>>=2;
1.1050 + asm volatile ("\n"
1.1051 + " movq 0(%0), %%mm6\n"
1.1052 + " movq 8(%0), %%mm7\n"
1.1053 + " movzwl 0(%1), %%ecx\n"
1.1054 + " movd %%ecx, %%mm3\n"
1.1055 + " pshufw $0x00, %%mm3, %%mm3\n"
1.1056 + " pxor %%mm4, %%mm4\n"
1.1057 + " movzwl 2(%1), %%ecx\n"
1.1058 + " movd %%ecx, %%mm4\n"
1.1059 + :: "r" (s3_2), "r" (s4_2)
1.1060 + : "ecx"
1.1061 + );
1.1062 + asm volatile ("\n"
1.1063 + "1:\n"
1.1064 + " pshufw $0x00, %%mm6, %%mm1\n"
1.1065 + " movq 0(%2), %%mm0\n"
1.1066 + " pmullw %%mm1, %%mm0\n"
1.1067 + " pshufw $0x55, %%mm6, %%mm2\n"
1.1068 + " movq 2(%2), %%mm1\n"
1.1069 + " pmullw %%mm2, %%mm1\n"
1.1070 + " paddw %%mm1, %%mm0\n"
1.1071 + " pshufw $0xaa, %%mm6, %%mm2\n"
1.1072 + " movq 4(%2), %%mm1\n"
1.1073 + " pmullw %%mm2, %%mm1\n"
1.1074 + " paddw %%mm1, %%mm0\n"
1.1075 + " pshufw $0xff, %%mm6, %%mm2\n"
1.1076 + " movq 6(%2), %%mm1\n"
1.1077 + " pmullw %%mm2, %%mm1\n"
1.1078 + " paddw %%mm1, %%mm0\n"
1.1079 +
1.1080 + " pshufw $0x00, %%mm7, %%mm2\n"
1.1081 + " movq 8(%2), %%mm1\n"
1.1082 + " pmullw %%mm2, %%mm1\n"
1.1083 + " paddw %%mm1, %%mm0\n"
1.1084 + " pshufw $0x55, %%mm7, %%mm2\n"
1.1085 + " movq 10(%2), %%mm1\n"
1.1086 + " pmullw %%mm2, %%mm1\n"
1.1087 + " paddw %%mm1, %%mm0\n"
1.1088 + " pshufw $0xaa, %%mm7, %%mm2\n"
1.1089 + " movq 12(%2), %%mm1\n"
1.1090 + " pmullw %%mm2, %%mm1\n"
1.1091 + " paddw %%mm1, %%mm0\n"
1.1092 + " pshufw $0xff, %%mm7, %%mm2\n"
1.1093 + " movq 14(%2), %%mm1\n"
1.1094 + " pmullw %%mm2, %%mm1\n"
1.1095 + " paddw %%mm1, %%mm0\n"
1.1096 +
1.1097 + " paddw %%mm3, %%mm0\n"
1.1098 + " psraw %%mm4, %%mm0\n"
1.1099 + " paddw 0(%1), %%mm0\n"
1.1100 + " movq %%mm0, 0(%0)\n"
1.1101 + " add $8, %0\n"
1.1102 + " add $8, %1\n"
1.1103 + " add $8, %2\n"
1.1104 + " decl %3\n"
1.1105 + " jnz 1b\n"
1.1106 + " emms\n"
1.1107 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1108 + );
1.1109 +}
1.1110 +OIL_DEFINE_IMPL_FULL (mas8_add_s16_mmx, mas8_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1111 +#endif
1.1112 +
1.1113 +
1.1114 +void
1.1115 +mas4_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1116 + int16_t *s4_2, int n)
1.1117 +{
1.1118 + if (n==0) return;
1.1119 + asm volatile ("\n"
1.1120 + " movq 0(%0), %%mm6\n"
1.1121 + " movzwl 0(%1), %%ecx\n"
1.1122 + " movd %%ecx, %%mm3\n"
1.1123 + " movzwl 2(%1), %%ecx\n"
1.1124 + " movd %%ecx, %%mm4\n"
1.1125 + :: "r" (s3_2), "r" (s4_2)
1.1126 + : "ecx"
1.1127 + );
1.1128 + asm volatile ("\n"
1.1129 + "1:\n"
1.1130 + " movq 0(%2), %%mm0\n"
1.1131 + " pmaddwd %%mm6, %%mm0\n"
1.1132 + " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
1.1133 + " paddd %%mm1, %%mm0\n"
1.1134 + " paddd %%mm3, %%mm0\n"
1.1135 + " psrad %%mm4, %%mm0\n"
1.1136 + " movd %%mm0, %%eax\n"
1.1137 + " addw 0(%1), %%ax\n"
1.1138 + " movw %%ax, 0(%0)\n"
1.1139 + " add $2, %0\n"
1.1140 + " add $2, %1\n"
1.1141 + " add $2, %2\n"
1.1142 + " decl %3\n"
1.1143 + " jnz 1b\n"
1.1144 + " emms\n"
1.1145 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1146 + :
1.1147 + : "eax"
1.1148 + );
1.1149 +}
1.1150 +OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1151 +
1.1152 +void
1.1153 +mas4_add_s16_pmaddwd_2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1154 + int16_t *s4_2, int n)
1.1155 +{
1.1156 + if (n==0) return;
1.1157 + asm volatile ("\n"
1.1158 + " movq 0(%0), %%mm6\n"
1.1159 + " movzwl 0(%1), %%ecx\n"
1.1160 + " movd %%ecx, %%mm3\n"
1.1161 + " pshufw $0x44, %%mm3, %%mm3\n" // 01 00 01 00
1.1162 + " movzwl 2(%1), %%ecx\n"
1.1163 + " movd %%ecx, %%mm4\n"
1.1164 + :: "r" (s3_2), "r" (s4_2)
1.1165 + : "ecx"
1.1166 + );
1.1167 + if (n&1) {
1.1168 + asm volatile ("\n"
1.1169 + " movq 0(%2), %%mm0\n"
1.1170 + " pmaddwd %%mm6, %%mm0\n"
1.1171 + " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
1.1172 + " paddd %%mm1, %%mm0\n"
1.1173 + " paddd %%mm3, %%mm0\n"
1.1174 + " psrad %%mm4, %%mm0\n"
1.1175 + " movd %%mm0, %%eax\n"
1.1176 + " addw 0(%1), %%ax\n"
1.1177 + " movw %%ax, 0(%0)\n"
1.1178 + " add $2, %0\n"
1.1179 + " add $2, %1\n"
1.1180 + " add $2, %2\n"
1.1181 + " decl %3\n"
1.1182 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1183 + :
1.1184 + : "eax"
1.1185 + );
1.1186 + }
1.1187 +  n>>=1;
+  if (n == 0) return;
1.1188 + asm volatile ("\n"
1.1189 + "1:\n"
1.1190 + " movq 0(%2), %%mm0\n"
1.1191 + " pmaddwd %%mm6, %%mm0\n"
1.1192 + " movq 2(%2), %%mm2\n"
1.1193 + " pmaddwd %%mm6, %%mm2\n"
1.1194 +
1.1195 + " movq %%mm0, %%mm1\n"
1.1196 + " punpckhdq %%mm2, %%mm0\n"
1.1197 + " punpckldq %%mm2, %%mm1\n"
1.1198 +
1.1199 + " paddd %%mm1, %%mm0\n"
1.1200 + " paddd %%mm3, %%mm0\n"
1.1201 + " psrad %%mm4, %%mm0\n"
1.1202 + " pshufw $0xd8, %%mm0, %%mm0\n" // 11 01 10 00
1.1203 +
1.1204 + " paddw 0(%1), %%mm0\n"
1.1205 + " movd %%mm0, 0(%0)\n"
1.1206 + " add $4, %0\n"
1.1207 + " add $4, %1\n"
1.1208 + " add $4, %2\n"
1.1209 + " decl %3\n"
1.1210 + " jnz 1b\n"
1.1211 + " emms\n"
1.1212 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1213 + :
1.1214 + : "eax"
1.1215 + );
1.1216 +}
1.1217 +OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd_2, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1218 +
1.1219 +void
1.1220 +mas8_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1221 + int16_t *s4_2, int n)
1.1222 +{
1.1223 + if (n==0) return;
1.1224 + asm volatile ("\n"
1.1225 + " movq 0(%0), %%mm6\n"
1.1226 + " movq 8(%0), %%mm7\n"
1.1227 + " movzwl 0(%1), %%ecx\n"
1.1228 + " movd %%ecx, %%mm3\n"
1.1229 + " movzwl 2(%1), %%ecx\n"
1.1230 + " movd %%ecx, %%mm4\n"
1.1231 + :: "r" (s3_2), "r" (s4_2)
1.1232 + : "ecx"
1.1233 + );
1.1234 + asm volatile ("\n"
1.1235 + "1:\n"
1.1236 + " movq 0(%2), %%mm0\n"
1.1237 + " pmaddwd %%mm6, %%mm0\n"
1.1238 + " movq 8(%2), %%mm1\n"
1.1239 + " pmaddwd %%mm7, %%mm1\n"
1.1240 + " paddd %%mm1, %%mm0\n"
1.1241 + " pshufw $0xee, %%mm0, %%mm1\n"
1.1242 + " paddd %%mm1, %%mm0\n"
1.1243 + " paddd %%mm3, %%mm0\n"
1.1244 + " psrad %%mm4, %%mm0\n"
1.1245 + " movd %%mm0, %%eax\n"
1.1246 + " addw 0(%1), %%ax\n"
1.1247 + " movw %%ax, 0(%0)\n"
1.1248 + " add $2, %0\n"
1.1249 + " add $2, %1\n"
1.1250 + " add $2, %2\n"
1.1251 + " decl %3\n"
1.1252 + " jnz 1b\n"
1.1253 + " emms\n"
1.1254 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1255 + :
1.1256 + : "eax"
1.1257 + );
1.1258 +}
1.1259 +OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd, mas8_add_s16, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
1.1260 +
1.1261 +
1.1262 +
1.1263 +#if 0
1.1264 +void
1.1265 +mas8_add_s16_pmaddwd2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1266 + int16_t *s4_2, int n)
1.1267 +{
1.1268 + while (n&3) {
1.1269 + int x;
1.1270 + int i;
1.1271 +
1.1272 + x = s4_2[0];
1.1273 + for(i=0;i<8;i++){
1.1274 + x += s2[i]*s3_2[i];
1.1275 + }
1.1276 + x >>= s4_2[1];
1.1277 + d1[0] = s1[0] + x;
1.1278 +
1.1279 + d1++;
1.1280 + s1++;
1.1281 + s2++;
1.1282 + n--;
1.1283 + }
1.1284 + if (n==0) return;
1.1285 +
1.1286 + n>>=2;
1.1287 + asm volatile ("\n"
1.1288 + " movq 0(%0), %%mm6\n"
1.1289 + " movq 8(%0), %%mm7\n"
1.1290 + " movzwl 0(%1), %%ecx\n"
1.1291 + " movd %%ecx, %%mm5\n"
1.1292 + " pshufw $0x00, %%mm5, %%mm5\n"
1.1293 + " pxor %%mm4, %%mm4\n"
1.1294 + " movzwl 2(%1), %%ecx\n"
1.1295 + " movd %%ecx, %%mm4\n"
1.1296 + :: "r" (s3_2), "r" (s4_2)
1.1297 + : "ecx"
1.1298 + );
1.1299 + asm volatile ("\n"
1.1300 + "1:\n"
1.1301 + " movq 0(%2), %%mm0\n"
1.1302 + " pmaddwd %%mm6, %%mm0\n"
1.1303 + " movq 8(%2), %%mm1\n"
1.1304 + " pmaddwd %%mm7, %%mm1\n"
1.1305 + " paddd %%mm1, %%mm0\n"
1.1306 + " pshufw $0xee, %%mm0, %%mm1\n"
1.1307 + " paddw %%mm1, %%mm0\n"
1.1308 +
1.1309 + " movq 2(%2), %%mm2\n"
1.1310 + " pmaddwd %%mm6, %%mm2\n"
1.1311 + " movq 10(%2), %%mm3\n"
1.1312 + " pmaddwd %%mm7, %%mm3\n"
1.1313 + " paddd %%mm3, %%mm2\n"
1.1314 + " pshufw $0xee, %%mm2, %%mm3\n"
1.1315 + " paddw %%mm3, %%mm2\n"
1.1316 + " pextrw $0, %%mm2, %%eax\n"
1.1317 + " pinsrw $1, %%eax, %%mm0\n"
1.1318 +
1.1319 + " movq 4(%2), %%mm2\n"
1.1320 + " pmaddwd %%mm6, %%mm2\n"
1.1321 + " movq 12(%2), %%mm3\n"
1.1322 + " pmaddwd %%mm7, %%mm3\n"
1.1323 + " paddd %%mm3, %%mm2\n"
1.1324 + " pshufw $0xee, %%mm2, %%mm3\n"
1.1325 + " paddw %%mm3, %%mm2\n"
1.1326 + " pextrw $0, %%mm2, %%eax\n"
1.1327 + " pinsrw $2, %%eax, %%mm0\n"
1.1328 +
1.1329 + " movq 6(%2), %%mm2\n"
1.1330 + " pmaddwd %%mm6, %%mm2\n"
1.1331 + " movq 14(%2), %%mm3\n"
1.1332 + " pmaddwd %%mm7, %%mm3\n"
1.1333 + " paddd %%mm3, %%mm2\n"
1.1334 + " pshufw $0xee, %%mm2, %%mm3\n"
1.1335 + " paddw %%mm3, %%mm2\n"
1.1336 + " pextrw $0, %%mm2, %%eax\n"
1.1337 + " pinsrw $3, %%eax, %%mm0\n"
1.1338 +
1.1339 + " paddw %%mm5, %%mm0\n"
1.1340 + " psraw %%mm4, %%mm0\n"
1.1341 + " paddw 0(%1), %%mm0\n"
1.1342 + " movq %%mm0, 0(%0)\n"
1.1343 + " add $8, %0\n"
1.1344 + " add $8, %1\n"
1.1345 + " add $8, %2\n"
1.1346 + " decl %3\n"
1.1347 + " jnz 1b\n"
1.1348 + " emms\n"
1.1349 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1350 + :
1.1351 + : "eax"
1.1352 + );
1.1353 +}
1.1354 +OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
1.1355 +#endif
1.1356 +
1.1357 +#if 0
1.1358 +/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
1.1359 +void
1.1360 +mas8_add_s16_sse2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
1.1361 + int16_t *s4_2, int n)
1.1362 +{
1.1363 + asm volatile ("\n"
1.1364 + " movq 0(%0), %%mm6\n"
1.1365 + " movq 8(%0), %%mm7\n"
1.1366 + " movzwl 0(%1), %%ecx\n"
1.1367 + " movd %%ecx, %%mm3\n"
1.1368 + " pshufw $0x00, %%mm3, %%mm3\n"
1.1369 + " pxor %%mm4, %%mm4\n"
1.1370 + " movzwl 2(%1), %%ecx\n"
1.1371 + " movd %%ecx, %%mm4\n"
1.1372 + :: "r" (s3_2), "r" (s4_2)
1.1373 + : "ecx"
1.1374 + );
1.1375 + asm volatile ("\n"
1.1376 + "1:\n"
1.1377 + " movq 0(%2), %%mm0\n"
1.1378 + " pmullw %%mm6, %%mm0\n"
1.1379 + " movq 8(%2), %%mm1\n"
1.1380 + " pmullw %%mm7, %%mm1\n"
1.1381 + " paddw %%mm1, %%mm0\n"
1.1382 + " pshufw $0xee, %%mm0, %%mm1\n"
1.1383 + " paddw %%mm1, %%mm0\n"
1.1384 + " pshufw $0x01, %%mm0, %%mm1\n"
1.1385 + " paddw %%mm1, %%mm0\n"
1.1386 + " paddw %%mm3, %%mm0\n"
1.1387 + " psraw %%mm4, %%mm0\n"
1.1388 + " movd %%mm0, %%eax\n"
1.1389 + " addw 0(%1), %%ax\n"
1.1390 + " movw %%ax, 0(%0)\n"
1.1391 + " add $2, %0\n"
1.1392 + " add $2, %1\n"
1.1393 + " add $2, %2\n"
1.1394 + " decl %3\n"
1.1395 + " jnz 1b\n"
1.1396 + " emms\n"
1.1397 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
1.1398 + :
1.1399 + : "eax"
1.1400 + );
1.1401 +}
1.1402 +OIL_DEFINE_IMPL_FULL (mas8_add_s16_sse2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
1.1403 +#endif
1.1404 +
1.1405 +void
1.1406 +mas2_across_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
1.1407 + int16_t *s4_2, int16_t *s5_2, int n)
1.1408 +{
1.1409 + int shift = s5_2[1];
1.1410 +
1.1411 + while (n&3) {
1.1412 + int x;
1.1413 +
1.1414 + x = s5_2[0] + s2[0]*s4_2[0] + s3[0]*s4_2[1];
1.1415 + x >>= s5_2[1];
1.1416 + d1[0] = s1[0] + x;
1.1417 +
1.1418 + d1++;
1.1419 + s1++;
1.1420 + s2++;
1.1421 + s3++;
1.1422 + n--;
1.1423 + }
1.1424 + if (n==0) return;
1.1425 +
1.1426 + n>>=2;
1.1427 + if (n==0) return;
1.1428 + asm volatile ("\n"
1.1429 + " movzwl 0(%0), %%ecx\n"
1.1430 + " movd %%ecx, %%mm7\n"
1.1431 + " pshufw $0x00, %%mm7, %%mm7\n"
1.1432 + " movzwl 2(%0), %%ecx\n"
1.1433 + " movd %%ecx, %%mm6\n"
1.1434 + " pshufw $0x00, %%mm6, %%mm6\n"
1.1435 + " movzwl 0(%1), %%ecx\n"
1.1436 + " movd %%ecx, %%mm5\n"
1.1437 + " pshufw $0x44, %%mm5, %%mm5\n"
1.1438 + :: "r" (s4_2), "r" (s5_2)
1.1439 + : "ecx"
1.1440 + );
1.1441 + asm volatile ("\n"
1.1442 + "1:\n"
1.1443 + " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
1.1444 + " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
1.1445 + " pmullw %%mm7, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
1.1446 + " pmulhw %%mm7, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
1.1447 + " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
1.1448 + " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
1.1449 + " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
1.1450 + " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0
1.1451 +
1.1452 + " movq 0(%3), %%mm2\n"
1.1453 + " movq 0(%3), %%mm3\n"
1.1454 + " pmullw %%mm6, %%mm2\n"
1.1455 + " pmulhw %%mm6, %%mm3\n"
1.1456 + " movq %%mm2, %%mm4\n"
1.1457 + " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
1.1458 + " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
1.1459 + " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1
1.1460 +
1.1461 + " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
1.1462 + " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...
1.1463 +
1.1464 + " paddd %%mm5, %%mm1\n" // mm1 = s2*a0 + s3*a1 + offset, ...
1.1465 + " paddd %%mm5, %%mm0\n" // mm0 = s0*a0 + s1*a1 + offset, ...
1.1466 +
1.1467 + " movd %5, %%mm4\n"
1.1468 + " psrad %%mm4, %%mm1\n" // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
1.1469 + " psrad %%mm4, %%mm0\n" // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
1.1470 +
1.1471 + " packssdw %%mm1, %%mm0\n"
1.1472 + " paddw 0(%1), %%mm0\n"
1.1473 + " movq %%mm0, 0(%0)\n"
1.1474 + " add $8, %0\n"
1.1475 + " add $8, %1\n"
1.1476 + " add $8, %2\n"
1.1477 + " add $8, %3\n"
1.1478 + " decl %4\n"
1.1479 + " jnz 1b\n"
1.1480 + " emms\n"
1.1481 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+m" (n)
1.1482 + : "r" (shift)
1.1483 + );
1.1484 +}
1.1485 +OIL_DEFINE_IMPL_FULL (mas2_across_add_s16_mmx, mas2_across_add_s16,
1.1486 + OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1487 +
1.1488 +void
1.1489 +add_const_rshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2_2, int n)
1.1490 +{
1.1491 + while(n&3) {
1.1492 + d1[0] = (s1[0] + s2_2[0])>>s2_2[1];
1.1493 + d1++;
1.1494 + s1++;
1.1495 + n--;
1.1496 + }
1.1497 + n>>=2;
1.1498 + if (n==0) return;
1.1499 + asm volatile ("\n"
1.1500 + " movzwl 0(%2), %%ecx\n"
1.1501 + " movd %%ecx, %%mm7\n"
1.1502 + " pshufw $0x00, %%mm7, %%mm7\n"
1.1503 + " movzwl 2(%2), %%ecx\n"
1.1504 + " movd %%ecx, %%mm6\n"
1.1505 + "1:\n"
1.1506 + " movq 0(%1), %%mm0\n"
1.1507 + " paddsw %%mm7, %%mm0\n"
1.1508 + " psraw %%mm6, %%mm0\n"
1.1509 + " movq %%mm0, 0(%0)\n"
1.1510 + " add $8, %0\n"
1.1511 + " add $8, %1\n"
1.1512 + " decl %3\n"
1.1513 + " jnz 1b\n"
1.1514 + " emms\n"
1.1515 + : "+r" (d1), "+r" (s1), "+r" (s2_2), "+r" (n)
1.1516 + :
1.1517 + : "ecx"
1.1518 + );
1.1519 +
1.1520 +}
1.1521 +OIL_DEFINE_IMPL_FULL (add_const_rshift_s16_mmx, add_const_rshift_s16,
1.1522 + OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
1.1523 +
1.1524 +void
1.1525 +multiply_and_add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3, int n)
1.1526 +{
1.1527 + while(n&3) {
1.1528 + d1[0] = s1[0] + s2[0]*s3[0];
1.1529 + d1++;
1.1530 + s1++;
1.1531 + s2++;
1.1532 + s3++;
1.1533 + n--;
1.1534 + }
1.1535 + n>>=2;
1.1536 + if (n==0) return;
1.1537 + asm volatile ("\n"
1.1538 + "1:\n"
1.1539 + " movq 0(%2), %%mm0\n"
1.1540 + " pmullw 0(%3), %%mm0\n"
1.1541 + " paddw 0(%1), %%mm0\n"
1.1542 + " movq %%mm0, 0(%0)\n"
1.1543 + " add $8, %0\n"
1.1544 + " add $8, %1\n"
1.1545 + " add $8, %2\n"
1.1546 + " add $8, %3\n"
1.1547 + " decl %4\n"
1.1548 + " jnz 1b\n"
1.1549 + " emms\n"
1.1550 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
1.1551 + );
1.1552 +
1.1553 +}
1.1554 +OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_mmx, multiply_and_add_s16,
1.1555 + OIL_IMPL_FLAG_MMX);
1.1556 +
1.1557 +void
1.1558 +multiply_and_add_s16_u8_mmx(int16_t *d1, int16_t *s1, int16_t *s2,
1.1559 + uint8_t *s3, int n)
1.1560 +{
1.1561 + while(n&3) {
1.1562 + d1[0] = s1[0] + s2[0]*s3[0];
1.1563 + d1++;
1.1564 + s1++;
1.1565 + s2++;
1.1566 + s3++;
1.1567 + n--;
1.1568 + }
1.1569 + n>>=2;
1.1570 + if (n==0) return;
1.1571 + asm volatile ("\n"
1.1572 + " pxor %%mm7, %%mm7\n"
1.1573 + "1:\n"
1.1574 + " movd 0(%3), %%mm0\n"
1.1575 + " punpcklbw %%mm7, %%mm0\n"
1.1576 + " pmullw 0(%2), %%mm0\n"
1.1577 + " paddw 0(%1), %%mm0\n"
1.1578 + " movq %%mm0, 0(%0)\n"
1.1579 + " add $8, %0\n"
1.1580 + " add $8, %1\n"
1.1581 + " add $8, %2\n"
1.1582 + " add $4, %3\n"
1.1583 + " decl %4\n"
1.1584 + " jnz 1b\n"
1.1585 + " emms\n"
1.1586 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
1.1587 + );
1.1588 +
1.1589 +}
1.1590 +OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8,
1.1591 + OIL_IMPL_FLAG_MMX);
1.1592 +
1.1593 +void
1.1594 +multiply_and_add_s16_u8_mmx_2(int16_t *d1, int16_t *s1, int16_t *s2,
1.1595 + uint8_t *s3, int n)
1.1596 +{
1.1597 + while(n&7) {
1.1598 + d1[0] = s1[0] + s2[0]*s3[0];
1.1599 + d1++;
1.1600 + s1++;
1.1601 + s2++;
1.1602 + s3++;
1.1603 + n--;
1.1604 + }
1.1605 + n>>=3;
1.1606 + if (n==0) return;
1.1607 + asm volatile ("\n"
1.1608 + " pxor %%mm7, %%mm7\n"
1.1609 + "1:\n"
1.1610 + " movd 0(%3), %%mm0\n"
1.1611 + " punpcklbw %%mm7, %%mm0\n"
1.1612 + " movd 4(%3), %%mm1\n"
1.1613 + " pmullw 0(%2), %%mm0\n"
1.1614 + " punpcklbw %%mm7, %%mm1\n"
1.1615 + " paddw 0(%1), %%mm0\n"
1.1616 + " pmullw 8(%2), %%mm1\n"
1.1617 + " movq %%mm0, 0(%0)\n"
1.1618 + " paddw 8(%1), %%mm1\n"
1.1619 + " movq %%mm1, 8(%0)\n"
1.1620 +
1.1621 + " add $16, %0\n"
1.1622 + " add $16, %1\n"
1.1623 + " add $16, %2\n"
1.1624 + " add $8, %3\n"
1.1625 + " decl %4\n"
1.1626 + " jnz 1b\n"
1.1627 + " emms\n"
1.1628 + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
1.1629 + );
1.1630 +
1.1631 +}
1.1632 +OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8,
1.1633 + OIL_IMPL_FLAG_MMX);
1.1634 +
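+/* 2-D accumulate: for each of the n rows, add twelve s16*u8 products into
+   i1 in place; is1, ss1 and ss2 are the byte strides of the three arrays. */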
1.1635 +void
1.1636 +multiply_and_acc_12xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
1.1637 + int ss1, uint8_t *s2, int ss2, int n)
1.1638 +{
1.1639 + if (n==0) return;
1.1640 + __asm__ __volatile__ ("\n"
1.1641 + " pxor %%mm7, %%mm7\n"
1.1642 + "1:\n"
1.1643 + " movd 0(%2), %%mm0\n"
1.1644 + " punpcklbw %%mm7, %%mm0\n"
1.1645 + " pmullw 0(%1), %%mm0\n"
1.1646 + " paddw 0(%0), %%mm0\n"
1.1647 + " movq %%mm0, 0(%0)\n"
1.1648 + " movd 4(%2), %%mm1\n"
1.1649 + " punpcklbw %%mm7, %%mm1\n"
1.1650 + " pmullw 8(%1), %%mm1\n"
1.1651 + " paddw 8(%0), %%mm1\n"
1.1652 + " movq %%mm1, 8(%0)\n"
1.1653 + " movd 8(%2), %%mm2\n"
1.1654 + " punpcklbw %%mm7, %%mm2\n"
1.1655 + " pmullw 16(%1), %%mm2\n"
1.1656 + " paddw 16(%0), %%mm2\n"
1.1657 + " movq %%mm2, 16(%0)\n"
1.1658 +
1.1659 + " addl %4, %0\n"
1.1660 + " addl %5, %1\n"
1.1661 + " addl %6, %2\n"
1.1662 + " decl %3\n"
1.1663 + " jnz 1b\n"
1.1664 + " emms\n"
1.1665 + : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
1.1666 + : "m" (is1), "m" (ss1), "m" (ss2)
1.1667 + );
1.1668 +}
1.1669 +OIL_DEFINE_IMPL_FULL (multiply_and_acc_12xn_s16_u8_mmx,
1.1670 + multiply_and_acc_12xn_s16_u8, OIL_IMPL_FLAG_MMX);
1.1671 +
1.1672 +#ifdef ENABLE_BROKEN_IMPLS
+/* kept for reference; the fixed version follows below */
1.1673 +void
1.1674 +mas4_across_add_s16_mmx_broken (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
1.1675 + int16_t *s3_4, int16_t *s4_2, int n)
1.1676 +{
1.1677 + int16_t *s2_nx4_off;
1.1678 +
1.1679 + while (n&3) {
1.1680 + int x;
1.1681 + int j;
1.1682 + x = s4_2[0];
1.1683 + for(j=0;j<4;j++){
1.1684 + x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
1.1685 + }
1.1686 + x >>= s4_2[1];
1.1687 + d[0] = s1[0] + x;
1.1688 +
1.1689 + n--;
1.1690 + d++;
1.1691 + s1++;
1.1692 + s2_nx4++;
1.1693 + }
1.1694 + if (n==0) return;
1.1695 +
1.1696 + s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);
1.1697 +
1.1698 + n >>= 2;
1.1699 + __asm__ __volatile__ ("\n"
1.1700 + " movq 0(%[s3_4]), %%mm0\n"
1.1701 + " pshufw $0x55, %%mm0, %%mm1\n"
1.1702 + " pshufw $0xaa, %%mm0, %%mm2\n"
1.1703 + " pshufw $0xff, %%mm0, %%mm3\n"
1.1704 + " pshufw $0x00, %%mm0, %%mm0\n"
1.1705 + " movzwl 0(%[s4_2]), %%ecx\n"
1.1706 + " movd %%ecx, %%mm7\n"
1.1707 + " pshufw $0x00, %%mm7, %%mm7\n"
1.1708 + " movzwl 2(%[s4_2]), %%ecx\n"
1.1709 + " movd %%ecx, %%mm6\n"
1.1710 + :
1.1711 + : [s3_4] "r" (s3_4),
1.1712 + [s4_2] "r" (s4_2)
1.1713 + : "ecx"
1.1714 + );
1.1715 +
1.1716 + __asm__ __volatile__ ("\n"
1.1717 + "1:\n"
1.1718 + " movq 0(%[s2_nx4]), %%mm4\n"
1.1719 + " pmullw %%mm0, %%mm4\n"
1.1720 + " movq (%[s2_nx4],%[sstr]), %%mm5\n"
1.1721 + " pmullw %%mm1, %%mm5\n"
1.1722 + " paddsw %%mm5,%%mm4\n"
1.1723 + " movq (%[s2_nx4],%[sstr],2), %%mm5\n"
1.1724 + " pmullw %%mm2, %%mm5\n"
1.1725 + " paddsw %%mm5,%%mm4\n"
1.1726 + " movq (%[s2_nx4_off]), %%mm5\n"
1.1727 + " pmullw %%mm3, %%mm5\n"
1.1728 + " paddsw %%mm5,%%mm4\n"
1.1729 + " paddsw %%mm7, %%mm4\n"
1.1730 + " psraw %%mm6, %%mm4\n"
1.1731 + " paddsw (%[s1]),%%mm4\n"
1.1732 + " movq %%mm4, 0(%[d])\n"
1.1733 +
1.1734 + " addl $8, %[s2_nx4]\n"
1.1735 + " addl $8, %[s2_nx4_off]\n"
1.1736 + " addl $8, %[s1]\n"
1.1737 + " addl $8, %[d]\n"
1.1738 + " decl %[n]\n"
1.1739 + " jnz 1b\n"
1.1740 + " emms\n"
1.1741 + : [s2_nx4] "+r" (s2_nx4),
1.1742 + [d] "+r" (d),
1.1743 + [s2_nx4_off] "+r" (s2_nx4_off),
1.1744 + [n] "+m" (n),
1.1745 + [s1] "+r" (s1)
1.1746 + : [sstr] "r" (sstr2)
1.1747 + );
1.1748 +}
1.1749 +OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx_broken, mas4_across_add_s16,
1.1750 + OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
1.1751 +#endif
1.1752 +
1.1753 +void
1.1754 +mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
1.1755 + int16_t *s3_4, int16_t *s4_2, int n)
1.1756 +{
1.1757 + int16_t *s2_nx4_off;
1.1758 +
1.1759 + while (n&3) {
1.1760 + int x;
1.1761 + int j;
1.1762 + x = s4_2[0];
1.1763 + for(j=0;j<4;j++){
1.1764 + x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
1.1765 + }
1.1766 + x >>= s4_2[1];
1.1767 + d[0] = s1[0] + x;
1.1768 +
1.1769 + n--;
1.1770 + d++;
1.1771 + s1++;
1.1772 + s2_nx4++;
1.1773 + }
1.1774 + if (n==0) return;
1.1775 +
1.1776 + s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);
1.1777 +
1.1778 + n >>= 2;
1.1779 + __asm__ __volatile__ ("\n"
1.1780 + " movq 0(%[s3_4]), %%mm0\n"
1.1781 + " pxor %%mm5, %%mm5\n"
1.1782 + " movd 0(%[s4_2]), %%mm5\n"
1.1783 + :
1.1784 + : [s3_4] "r" (s3_4),
1.1785 + [s4_2] "r" (s4_2)
1.1786 + );
1.1787 +
1.1788 + __asm__ __volatile__ ("\n"
1.1789 + "1:\n"
1.1790 + " pshufw $0x00, %%mm0, %%mm6\n"
1.1791 + " pmullw 0(%[s2_nx4]), %%mm6\n"
1.1792 + " pshufw $0x00, %%mm0, %%mm3\n"
1.1793 + " pmulhw 0(%[s2_nx4]), %%mm3\n"
1.1794 + " movq %%mm6, %%mm7\n"
1.1795 + " punpcklwd %%mm3, %%mm6\n"
1.1796 + " punpckhwd %%mm3, %%mm7\n"
1.1797 +
1.1798 + " pshufw $0x55, %%mm0, %%mm2\n"
1.1799 + " pmullw 0(%[s2_nx4],%[sstr]), %%mm2\n"
1.1800 + " pshufw $0x55, %%mm0, %%mm3\n"
1.1801 + " pmulhw 0(%[s2_nx4],%[sstr]), %%mm3\n"
1.1802 + " movq %%mm2, %%mm4\n"
1.1803 + " punpcklwd %%mm3, %%mm2\n"
1.1804 + " punpckhwd %%mm3, %%mm4\n"
1.1805 + " paddd %%mm2, %%mm6\n"
1.1806 + " paddd %%mm4, %%mm7\n"
1.1807 +
1.1808 + " pshufw $0xaa, %%mm0, %%mm2\n"
1.1809 + " pmullw 0(%[s2_nx4],%[sstr],2), %%mm2\n"
1.1810 + " pshufw $0xaa, %%mm0, %%mm3\n"
1.1811 + " pmulhw 0(%[s2_nx4],%[sstr],2), %%mm3\n"
1.1812 + " movq %%mm2, %%mm4\n"
1.1813 + " punpcklwd %%mm3, %%mm2\n"
1.1814 + " punpckhwd %%mm3, %%mm4\n"
1.1815 + " paddd %%mm2, %%mm6\n"
1.1816 + " paddd %%mm4, %%mm7\n"
1.1817 +
1.1818 + " pshufw $0xff, %%mm0, %%mm2\n"
1.1819 + " pmullw 0(%[s2_nx4_off]), %%mm2\n"
1.1820 + " pshufw $0xff, %%mm0, %%mm3\n"
1.1821 + " pmulhw 0(%[s2_nx4_off]), %%mm3\n"
1.1822 + " movq %%mm2, %%mm4\n"
1.1823 + " punpcklwd %%mm3, %%mm2\n"
1.1824 + " punpckhwd %%mm3, %%mm4\n"
1.1825 + " paddd %%mm2, %%mm6\n"
1.1826 + " paddd %%mm4, %%mm7\n"
1.1827 +
1.1828 + " pshufw $0xcc, %%mm5, %%mm1\n"
1.1829 + " paddd %%mm1, %%mm6\n"
1.1830 + " paddd %%mm1, %%mm7\n"
1.1831 +
1.1832 + " pshufw $0xfd, %%mm5, %%mm1\n"
1.1833 + " psrad %%mm1, %%mm6\n"
1.1834 + " psrad %%mm1, %%mm7\n"
1.1835 + " packssdw %%mm7, %%mm6\n"
1.1836 +
1.1837 + " paddsw (%[s1]),%%mm6\n"
1.1838 + " movq %%mm6, 0(%[d])\n"
1.1839 +
1.1840 + " addl $8, %[s2_nx4]\n"
1.1841 + " addl $8, %[s2_nx4_off]\n"
1.1842 + " addl $8, %[s1]\n"
1.1843 + " addl $8, %[d]\n"
1.1844 + " decl %[n]\n"
1.1845 + " jnz 1b\n"
1.1846 + " emms\n"
1.1847 + : [s2_nx4] "+r" (s2_nx4),
1.1848 + [d] "+r" (d),
1.1849 + [s2_nx4_off] "+r" (s2_nx4_off),
1.1850 + [n] "+m" (n),
1.1851 + [s1] "+r" (s1)
1.1852 + : [sstr] "r" (sstr2)
1.1853 + );
1.1854 +}
1.1855 +OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
1.1856 + OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
1.1857 +
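+/* 8-tap version of the routine above; the rows are reached by stepping a
+   scratch pointer (tmp) through s2_nx8 in sstr2-byte increments. */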
1.1858 +void
1.1859 +mas8_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx8, int sstr2,
1.1860 + int16_t *s3_8, int16_t *s4_2, int n)
1.1861 +{
1.1862 + int16_t *s2_nx8_off;
1.1863 + void *tmp = NULL;
1.1864 +
1.1865 + while (n&3) {
1.1866 + int x;
1.1867 + int j;
1.1868 + x = s4_2[0];
1.1869 + for(j=0;j<8;j++){
1.1870 + x += OIL_GET(s2_nx8, j*sstr2, int16_t)*s3_8[j];
1.1871 + }
1.1872 + x >>= s4_2[1];
1.1873 + d[0] = s1[0] + x;
1.1874 +
1.1875 + n--;
1.1876 + d++;
1.1877 + s1++;
1.1878 + s2_nx8++;
1.1879 + }
1.1880 + if (n==0) return;
1.1881 +
1.1882 + s2_nx8_off = OIL_OFFSET(s2_nx8, 7*sstr2);
1.1883 +
1.1884 + n >>= 2;
1.1885 + __asm__ __volatile__ ("\n"
1.1886 + " movq 0(%[s3_8]), %%mm0\n"
1.1887 + " pxor %%mm5, %%mm5\n"
1.1888 + " movd 0(%[s4_2]), %%mm5\n"
1.1889 + :
1.1890 + : [s3_8] "r" (s3_8),
1.1891 + [s4_2] "r" (s4_2)
1.1892 + );
1.1893 +
1.1894 + __asm__ __volatile__ ("\n"
1.1895 + "1:\n"
1.1896 + " movl %[s2_nx8], %[tmp]\n"
1.1897 + " movq 0(%[s3_8]), %%mm0\n"
1.1898 +
1.1899 + " pshufw $0x00, %%mm0, %%mm6\n"
1.1900 + " pmullw 0(%[tmp]), %%mm6\n"
1.1901 + " pshufw $0x00, %%mm0, %%mm3\n"
1.1902 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1903 + " movq %%mm6, %%mm7\n"
1.1904 + " punpcklwd %%mm3, %%mm6\n"
1.1905 + " punpckhwd %%mm3, %%mm7\n"
1.1906 +
1.1907 + " addl %[sstr], %[tmp]\n"
1.1908 + " pshufw $0x55, %%mm0, %%mm2\n"
1.1909 + " pmullw 0(%[tmp]), %%mm2\n"
1.1910 + " pshufw $0x55, %%mm0, %%mm3\n"
1.1911 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1912 + " movq %%mm2, %%mm4\n"
1.1913 + " punpcklwd %%mm3, %%mm2\n"
1.1914 + " punpckhwd %%mm3, %%mm4\n"
1.1915 + " paddd %%mm2, %%mm6\n"
1.1916 + " paddd %%mm4, %%mm7\n"
1.1917 +
1.1918 + " addl %[sstr], %[tmp]\n"
1.1919 + " pshufw $0xaa, %%mm0, %%mm2\n"
1.1920 + " pmullw 0(%[tmp]), %%mm2\n"
1.1921 + " pshufw $0xaa, %%mm0, %%mm3\n"
1.1922 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1923 + " movq %%mm2, %%mm4\n"
1.1924 + " punpcklwd %%mm3, %%mm2\n"
1.1925 + " punpckhwd %%mm3, %%mm4\n"
1.1926 + " paddd %%mm2, %%mm6\n"
1.1927 + " paddd %%mm4, %%mm7\n"
1.1928 +
1.1929 + " addl %[sstr], %[tmp]\n"
1.1930 + " pshufw $0xff, %%mm0, %%mm2\n"
1.1931 + " pmullw 0(%[tmp]), %%mm2\n"
1.1932 + " pshufw $0xff, %%mm0, %%mm3\n"
1.1933 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1934 + " movq %%mm2, %%mm4\n"
1.1935 + " punpcklwd %%mm3, %%mm2\n"
1.1936 + " punpckhwd %%mm3, %%mm4\n"
1.1937 + " paddd %%mm2, %%mm6\n"
1.1938 + " paddd %%mm4, %%mm7\n"
1.1939 +
1.1940 + " movq 8(%[s3_8]), %%mm0\n"
1.1941 +
1.1942 + " addl %[sstr], %[tmp]\n"
1.1943 + " pshufw $0x00, %%mm0, %%mm2\n"
1.1944 + " pmullw 0(%[tmp]), %%mm2\n"
1.1945 + " pshufw $0x00, %%mm0, %%mm3\n"
1.1946 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1947 + " movq %%mm2, %%mm4\n"
1.1948 + " punpcklwd %%mm3, %%mm2\n"
1.1949 + " punpckhwd %%mm3, %%mm4\n"
1.1950 + " paddd %%mm2, %%mm6\n"
1.1951 + " paddd %%mm4, %%mm7\n"
1.1952 +
1.1953 + " addl %[sstr], %[tmp]\n"
1.1954 + " pshufw $0x55, %%mm0, %%mm2\n"
1.1955 + " pmullw 0(%[tmp]), %%mm2\n"
1.1956 + " pshufw $0x55, %%mm0, %%mm3\n"
1.1957 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1958 + " movq %%mm2, %%mm4\n"
1.1959 + " punpcklwd %%mm3, %%mm2\n"
1.1960 + " punpckhwd %%mm3, %%mm4\n"
1.1961 + " paddd %%mm2, %%mm6\n"
1.1962 + " paddd %%mm4, %%mm7\n"
1.1963 +
1.1964 + " addl %[sstr], %[tmp]\n"
1.1965 + " pshufw $0xaa, %%mm0, %%mm2\n"
1.1966 + " pmullw 0(%[tmp]), %%mm2\n"
1.1967 + " pshufw $0xaa, %%mm0, %%mm3\n"
1.1968 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1969 + " movq %%mm2, %%mm4\n"
1.1970 + " punpcklwd %%mm3, %%mm2\n"
1.1971 + " punpckhwd %%mm3, %%mm4\n"
1.1972 + " paddd %%mm2, %%mm6\n"
1.1973 + " paddd %%mm4, %%mm7\n"
1.1974 +
1.1975 + " addl %[sstr], %[tmp]\n"
1.1976 + " pshufw $0xff, %%mm0, %%mm2\n"
1.1977 + " pmullw 0(%[tmp]), %%mm2\n"
1.1978 + " pshufw $0xff, %%mm0, %%mm3\n"
1.1979 + " pmulhw 0(%[tmp]), %%mm3\n"
1.1980 + " movq %%mm2, %%mm4\n"
1.1981 + " punpcklwd %%mm3, %%mm2\n"
1.1982 + " punpckhwd %%mm3, %%mm4\n"
1.1983 + " paddd %%mm2, %%mm6\n"
1.1984 + " paddd %%mm4, %%mm7\n"
1.1985 +
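+    /* add the rounding offset s4_2[0] (broadcast to both dwords), shift
+       right by s4_2[1], and pack the sums back to four int16s */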
1.1986 + " pshufw $0xcc, %%mm5, %%mm1\n"
1.1987 + " paddd %%mm1, %%mm6\n"
1.1988 + " paddd %%mm1, %%mm7\n"
1.1989 +
1.1990 + " pshufw $0xfd, %%mm5, %%mm1\n"
1.1991 + " psrad %%mm1, %%mm6\n"
1.1992 + " psrad %%mm1, %%mm7\n"
1.1993 + " packssdw %%mm7, %%mm6\n"
1.1994 +
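+    /* saturating add of s1 and store four output samples */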
1.1995 + " paddsw (%[s1]),%%mm6\n"
1.1996 + " movq %%mm6, 0(%[d])\n"
1.1997 +
1.1998 + " addl $8, %[s2_nx8]\n"
1.1999 + " addl $8, %[s1]\n"
1.2000 + " addl $8, %[d]\n"
1.2001 + " decl %[n]\n"
1.2002 + " jnz 1b\n"
1.2003 + " emms\n"
1.2004 + : [s2_nx8] "+r" (s2_nx8),
1.2005 + [tmp] "+r" (tmp),
1.2006 + [s3_8] "+r" (s3_8),
1.2007 + [d] "+r" (d),
1.2008 + [n] "+m" (n),
1.2009 + [s1] "+r" (s1)
1.2010 + : [sstr] "m" (sstr2)
1.2011 + );
1.2012 +}
1.2013 +OIL_DEFINE_IMPL_FULL (mas8_across_add_s16_mmx, mas8_across_add_s16,
1.2014 + OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
1.2015 +
1.2016 +void
1.2017 +lshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
1.2018 +{
1.2019 + while (n&3) {
1.2020 + d1[0] = s1[0]<<s3_1[0];
1.2021 + d1++;
1.2022 + s1++;
1.2023 + n--;
1.2024 + }
1.2025 + n >>= 2;
+ if (n == 0) return;
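+ /* a single shift count, taken from s3_1[0], is applied to all four lanes */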
1.2026 + __asm__ __volatile__ ("\n"
1.2027 + " movzwl 0(%[s3_1]), %%ecx\n"
1.2028 + " movd %%ecx, %%mm1\n"
1.2029 + "1:\n"
1.2030 + " movq 0(%[s1]), %%mm0\n"
1.2031 + " psllw %%mm1, %%mm0\n"
1.2032 + " movq %%mm0, 0(%[d1])\n"
1.2033 + " add $8, %[d1]\n"
1.2034 + " add $8, %[s1]\n"
1.2035 + " decl %[n]\n"
1.2036 + " jnz 1b\n"
1.2037 + " emms"
1.2038 + : [d1] "+r" (d1),
1.2039 + [s1] "+r" (s1),
1.2040 + [n] "+r" (n)
1.2041 + : [s3_1] "r" (s3_1)
1.2042 + : "ecx");
1.2043 +}
1.2044 +OIL_DEFINE_IMPL_FULL (lshift_s16_mmx, lshift_s16, OIL_IMPL_FLAG_MMX);
1.2045 +
1.2046 +void
1.2047 +lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
1.2048 +{
1.2049 + while (n&7) {
1.2050 + d1[0] = s1[0]<<s3_1[0];
1.2051 + d1++;
1.2052 + s1++;
1.2053 + n--;
1.2054 + }
1.2055 + n >>= 3;
1.2056 + if (n == 0) return;
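+ /* as lshift_s16_mmx, but unrolled to shift eight samples per iteration */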
1.2057 + __asm__ __volatile__ ("\n"
1.2058 + " movzwl 0(%[s3_1]), %%ecx\n"
1.2059 + " movd %%ecx, %%mm1\n"
1.2060 + "1:\n"
1.2061 + " movq 0(%[s1]), %%mm0\n"
1.2062 + " psllw %%mm1, %%mm0\n"
1.2063 + " movq %%mm0, 0(%[d1])\n"
1.2064 + " movq 8(%[s1]), %%mm0\n"
1.2065 + " psllw %%mm1, %%mm0\n"
1.2066 + " movq %%mm0, 8(%[d1])\n"
1.2067 + " add $16, %[d1]\n"
1.2068 + " add $16, %[s1]\n"
1.2069 + " decl %[n]\n"
1.2070 + " jnz 1b\n"
1.2071 + " emms"
1.2072 + : [d1] "+r" (d1),
1.2073 + [s1] "+r" (s1),
1.2074 + [n] "+r" (n)
1.2075 + : [s3_1] "r" (s3_1)
1.2076 + : "ecx");
1.2077 +}
1.2078 +OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
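+
+/* For reference, a sketch of how these implementations get used through
+ * liboil's public API (assuming the stock liboil entry points oil_init()
+ * and the generated oil_lshift_s16() wrapper; illustrative only, not part
+ * of this file):
+ *
+ *   #include <liboil/liboil.h>
+ *
+ *   int16_t dst[64], src[64];
+ *   int16_t shift = 2;
+ *   oil_init ();                            // profiles and picks an impl
+ *   oil_lshift_s16 (dst, src, &shift, 64);  // may dispatch to lshift_s16_mmx
+ */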
1.2079 +
1.2080 +
1.2081 +
1.2082 +
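+/* Symbian DLLs export functions but not global data, so the Symbian port
+ * exposes each impl record through an exported getter function. */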
1.2083 +#ifdef __SYMBIAN32__
1.2084 +
1.2085 +OilFunctionImpl* __oil_function_impl_deinterleave2_mmx() {
1.2086 + return &_oil_function_impl_deinterleave2_mmx;
1.2087 +}
1.2088 +#endif
1.2089 +
1.2090 +#ifdef __SYMBIAN32__
1.2091 +
1.2092 +OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_2() {
1.2093 + return &_oil_function_impl_deinterleave2_mmx_2;
1.2094 +}
1.2095 +#endif
1.2096 +
1.2097 +#ifdef __SYMBIAN32__
1.2098 +
1.2099 +OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_3() {
1.2100 + return &_oil_function_impl_deinterleave2_mmx_3;
1.2101 +}
1.2102 +#endif
1.2103 +
1.2104 +#ifdef __SYMBIAN32__
1.2105 +
1.2106 +OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_4() {
1.2107 + return &_oil_function_impl_deinterleave2_mmx_4;
1.2108 +}
1.2109 +#endif
1.2110 +
1.2111 +#ifdef __SYMBIAN32__
1.2112 +
1.2113 +OilFunctionImpl* __oil_function_impl_lift_add_mult_shift12_i386_mmx() {
1.2114 + return &_oil_function_impl_lift_add_mult_shift12_i386_mmx;
1.2115 +}
1.2116 +#endif
1.2117 +
1.2118 +#ifdef __SYMBIAN32__
1.2119 +
1.2120 +OilFunctionImpl* __oil_function_impl_interleave2_mmx() {
1.2121 + return &_oil_function_impl_interleave2_mmx;
1.2122 +}
1.2123 +#endif
1.2124 +
1.2125 +#ifdef __SYMBIAN32__
1.2126 +
1.2127 +OilFunctionImpl* __oil_function_impl_lift_add_shift1_mmx() {
1.2128 + return &_oil_function_impl_lift_add_shift1_mmx;
1.2129 +}
1.2130 +#endif
1.2131 +
1.2132 +#ifdef __SYMBIAN32__
1.2133 +
1.2134 +OilFunctionImpl* __oil_function_impl_lift_sub_shift1_mmx() {
1.2135 + return &_oil_function_impl_lift_sub_shift1_mmx;
1.2136 +}
1.2137 +#endif
1.2138 +
1.2139 +#ifdef __SYMBIAN32__
1.2140 +
1.2141 +OilFunctionImpl* __oil_function_impl_lift_add_shift2_mmx() {
1.2142 + return &_oil_function_impl_lift_add_shift2_mmx;
1.2143 +}
1.2144 +#endif
1.2145 +
1.2146 +#ifdef __SYMBIAN32__
1.2147 +
1.2148 +OilFunctionImpl* __oil_function_impl_lift_sub_shift2_mmx() {
1.2149 + return &_oil_function_impl_lift_sub_shift2_mmx;
1.2150 +}
1.2151 +#endif
1.2152 +
1.2153 +#ifdef __SYMBIAN32__
1.2154 +
1.2155 +OilFunctionImpl* __oil_function_impl_synth_53_mmx() {
1.2156 + return &_oil_function_impl_synth_53_mmx;
1.2157 +}
1.2158 +#endif
1.2159 +
1.2160 +#ifdef __SYMBIAN32__
1.2161 +
1.2162 +OilFunctionImpl* __oil_function_impl_mas2_add_s16_mmx() {
1.2163 + return &_oil_function_impl_mas2_add_s16_mmx;
1.2164 +}
1.2165 +#endif
1.2166 +
1.2167 +#ifdef __SYMBIAN32__
1.2168 +
1.2169 +OilFunctionImpl* __oil_function_impl_mas2_add_s16_lim_mmx() {
1.2170 + return &_oil_function_impl_mas2_add_s16_lim_mmx;
1.2171 +}
1.2172 +#endif
1.2173 +
1.2174 +#ifdef __SYMBIAN32__
1.2175 +
1.2176 +OilFunctionImpl* __oil_function_impl_mas4_add_s16_mmx() {
1.2177 + return &_oil_function_impl_mas4_add_s16_mmx;
1.2178 +}
1.2179 +#endif
1.2180 +
1.2195 +#ifdef __SYMBIAN32__
1.2196 +
1.2197 +OilFunctionImpl* __oil_function_impl_mas8_add_s16_mmx() {
1.2198 + return &_oil_function_impl_mas8_add_s16_mmx;
1.2199 +}
1.2200 +#endif
1.2201 +
1.2202 +#ifdef __SYMBIAN32__
1.2203 +
1.2204 +OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd() {
1.2205 + return &_oil_function_impl_mas4_add_s16_pmaddwd;
1.2206 +}
1.2207 +#endif
1.2208 +
1.2209 +#ifdef __SYMBIAN32__
1.2210 +
1.2211 +OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd_2() {
1.2212 + return &_oil_function_impl_mas4_add_s16_pmaddwd_2;
1.2213 +}
1.2214 +#endif
1.2215 +
1.2216 +#ifdef __SYMBIAN32__
1.2217 +
1.2218 +OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd() {
1.2219 + return &_oil_function_impl_mas8_add_s16_pmaddwd;
1.2220 +}
1.2221 +#endif
1.2222 +
1.2223 +#ifdef __SYMBIAN32__
1.2224 +
1.2225 +OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd2() {
1.2226 + return &_oil_function_impl_mas8_add_s16_pmaddwd2;
1.2227 +}
1.2228 +#endif
1.2229 +
1.2230 +#ifdef __SYMBIAN32__
1.2231 +
1.2232 +OilFunctionImpl* __oil_function_impl_mas8_add_s16_sse2() {
1.2233 + return &_oil_function_impl_mas8_add_s16_sse2;
1.2234 +}
1.2235 +#endif
1.2236 +
1.2237 +#ifdef __SYMBIAN32__
1.2238 +
1.2239 +OilFunctionImpl* __oil_function_impl_mas2_across_add_s16_mmx() {
1.2240 + return &_oil_function_impl_mas2_across_add_s16_mmx;
1.2241 +}
1.2242 +#endif
1.2243 +
1.2244 +#ifdef __SYMBIAN32__
1.2245 +
1.2246 +OilFunctionImpl* __oil_function_impl_add_const_rshift_s16_mmx() {
1.2247 + return &_oil_function_impl_add_const_rshift_s16_mmx;
1.2248 +}
1.2249 +#endif
1.2250 +
1.2251 +#ifdef __SYMBIAN32__
1.2252 +
1.2253 +OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_mmx() {
1.2254 + return &_oil_function_impl_multiply_and_add_s16_mmx;
1.2255 +}
1.2256 +#endif
1.2257 +
1.2258 +#ifdef __SYMBIAN32__
1.2259 +
1.2260 +OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx() {
1.2261 + return &_oil_function_impl_multiply_and_add_s16_u8_mmx;
1.2262 +}
1.2263 +#endif
1.2264 +
1.2265 +#ifdef __SYMBIAN32__
1.2266 +
1.2267 +OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx_2() {
1.2268 + return &_oil_function_impl_multiply_and_add_s16_u8_mmx_2;
1.2269 +}
1.2270 +#endif
1.2271 +
1.2272 +#ifdef __SYMBIAN32__
1.2273 +
1.2274 +OilFunctionImpl* __oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx() {
1.2275 + return &_oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx;
1.2276 +}
1.2277 +#endif
1.2278 +
1.2279 +#ifdef __SYMBIAN32__
1.2280 +
1.2281 +OilFunctionImpl* __oil_function_impl_mas4_across_add_s16_mmx() {
1.2282 + return &_oil_function_impl_mas4_across_add_s16_mmx;
1.2283 +}
1.2284 +#endif
1.2285 +
1.2293 +#ifdef __SYMBIAN32__
1.2294 +
1.2295 +OilFunctionImpl* __oil_function_impl_mas8_across_add_s16_mmx() {
1.2296 + return &_oil_function_impl_mas8_across_add_s16_mmx;
1.2297 +}
1.2298 +#endif
1.2299 +
1.2300 +#ifdef __SYMBIAN32__
1.2301 +
1.2302 +OilFunctionImpl* __oil_function_impl_lshift_s16_mmx() {
1.2303 + return &_oil_function_impl_lshift_s16_mmx;
1.2304 +}
1.2305 +#endif
1.2306 +
1.2307 +#ifdef __SYMBIAN32__
1.2308 +
1.2309 +OilFunctionImpl* __oil_function_impl_lshift_s16_mmx_2() {
1.2310 + return &_oil_function_impl_lshift_s16_mmx_2;
1.2311 +}
1.2312 +#endif
1.2313 +
1.2314 +
1.2315 +
1.2316 +#ifdef __SYMBIAN32__
1.2317 +
1.2318 +OilFunctionImpl* __oil_function_impl_split_53_nomix() {
1.2319 + return &_oil_function_impl_split_53_nomix;
1.2320 +}
1.2321 +#endif
1.2322 +
1.2323 +#ifdef __SYMBIAN32__
1.2324 +
1.2325 +OilFunctionImpl* __oil_function_impl_split_53_c() {
1.2326 + return &_oil_function_impl_split_53_c;
1.2327 +}
1.2328 +#endif
1.2329 +
1.2330 +#ifdef __SYMBIAN32__
1.2331 +
1.2332 +OilFunctionImpl* __oil_function_impl_synth_53_c() {
1.2333 + return &_oil_function_impl_synth_53_c;
1.2334 +}
1.2335 +#endif
1.2336 +
1.2337 +#ifdef __SYMBIAN32__
1.2338 +
1.2339 +OilFunctionImpl* __oil_function_impl_deinterleave2_c_1() {
1.2340 + return &_oil_function_impl_deinterleave2_c_1;
1.2341 +}
1.2342 +#endif
1.2343 +
1.2344 +#ifdef __SYMBIAN32__
1.2345 +
1.2346 +OilFunctionImpl* __oil_function_impl_deinterleave2_asm() {
1.2347 + return &_oil_function_impl_deinterleave2_asm;
1.2348 +}
1.2349 +#endif
1.2350 +