/* os/ossrv/genericopenlibs/liboil/src/i386/wavelet.c
 * i386/MMX implementations of liboil's wavelet (5/3 lifting) and
 * multiply-accumulate function classes. */
//Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
/*
Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org>
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.
*/

#include <liboil/liboilfunction.h>
#include <liboil/liboilclasses.h>


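/* Forward 5/3 wavelet lifting over an interleaved array of 2n samples:
 * first predict the odd (high-pass) samples,
 *   d[2i+1] = s[2i+1] - ((s[2i] + s[2i+2]) >> 1),
 * then update the even (low-pass) samples,
 *   d[2i] = s[2i] + ((d[2i-1] + d[2i+1]) >> 2),
 * with one-sided variants at the boundaries. */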
void
split_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  /* predict */
  for(i=1;i<n*2-2;i+=2){
    d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 1);
  }
  d_2xn[n*2-1] = s_2xn[n*2-1] - s_2xn[n*2-2];

  /* update */
  d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
  for(i=2;i<n*2;i+=2){
    d_2xn[i] = s_2xn[i] + ((d_2xn[i-1] + d_2xn[i+1]) >> 2);
  }
}
OIL_DEFINE_IMPL (split_53_nomix, split_53);

#if 0
/* Disabled: this was sketched as an in-place inverse transform, but it
 * operates on a buffer `i_n` that is never declared or wired up to the
 * d_2xn/s_2xn parameters, so it does not compile as written. */
void
synth_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  /* predict */
  i_n[0] -= i_n[1] >> 1;
  for(i=2;i<n*2;i+=2){
    i_n[i] -= (i_n[i-1] + i_n[i+1]) >> 2;
  }

  /* update */
  for(i=1;i<n*2-2;i+=2){
    i_n[i] += (i_n[i+1] + i_n[i-1]) >> 1;
  }
  i_n[n*2-1] += i_n[n*2-2];
}
#endif


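/* Same lifting arithmetic as split_53_nomix, restructured to advance the
 * d/s pointers pairwise so each iteration finishes one even/odd pair;
 * the n==1 case and the boundary pairs are handled separately. */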
void
split_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  if (n == 1) {
    d_2xn[1] = s_2xn[1] - s_2xn[0];
    d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
  } else {
    d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
    d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
    d_2xn+=2;
    s_2xn+=2;
    for(i=0;i<(n*2-4)/2;i++){
      d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
      d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
      d_2xn+=2;
      s_2xn+=2;
    }
    d_2xn[1] = s_2xn[1] - s_2xn[0];
    d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
  }
}
OIL_DEFINE_IMPL (split_53_c, split_53);

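/* Inverse 5/3 lifting: undo the update step,
 *   d[2i] = s[2i] - ((s[2i-1] + s[2i+1]) >> 2),
 * then undo the prediction,
 *   d[2i+1] = s[2i+1] + ((d[2i] + d[2i+2]) >> 1). */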
void
synth_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  if (n == 1) {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    d_2xn[1] = s_2xn[1] + d_2xn[0];
  } else {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    for(i=2;i<n*2-2;i+=2){
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
    }
    d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
    d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
    d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
  }
}
OIL_DEFINE_IMPL (synth_53_c, synth_53);

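/* Reference C implementation: split one interleaved stream into its even
 * (d1) and odd (d2) elements. */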
void
deinterleave2_c_1 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  int i;

  for(i=0;i<n;i++) {
    d1[i] = s_2xn[2*i];
    d2[i] = s_2xn[2*i + 1];
  }
}
OIL_DEFINE_IMPL (deinterleave2_c_1, deinterleave2_s16);

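/* Scalar x86 version.  The C loop peels off one element if n is odd; the
 * asm then copies two even/odd pairs per iteration, counting ecx down
 * from n-2 to 0. */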
void
deinterleave2_asm (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&1) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  /* checked after the scalar loop so that n==1 (which the loop consumes
   * entirely) does not fall through into the asm with n==0 */
  if (n == 0) return;

  asm volatile ("\n"
      "  mov %3, %%ecx\n"
      "  sub $2, %%ecx\n"
      "1:\n"
      "  movw (%1,%%ecx,4), %%ax\n"
      "  movw %%ax, (%0,%%ecx,2)\n"
      "  movw 2(%1,%%ecx,4), %%ax\n"
      "  movw %%ax, (%2,%%ecx,2)\n"
      "  movw 4(%1,%%ecx,4), %%ax\n"
      "  movw %%ax, 2(%0,%%ecx,2)\n"
      "  movw 6(%1,%%ecx,4), %%ax\n"
      "  movw %%ax, 2(%2,%%ecx,2)\n"
      "  sub $2, %%ecx\n"
      "  jge 1b\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL (deinterleave2_asm, deinterleave2_s16);

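/* MMX version: the even words of each quad are isolated with a
 * pslld/psrad pair (shift left then arithmetic right by 16) and packed
 * with packssdw; the odd words need only the arithmetic shift.  Four
 * outputs per destination per iteration. */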
void
deinterleave2_mmx (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm0\n"
      "  movq 8(%1,%%ecx,4), %%mm1\n"
      "  pslld $16, %%mm0\n"
      "  pslld $16, %%mm1\n"
      "  psrad $16, %%mm0\n"
      "  psrad $16, %%mm1\n"
      "  packssdw %%mm1, %%mm0\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  movq (%1,%%ecx,4), %%mm0\n"
      "  movq 8(%1,%%ecx,4), %%mm1\n"
      "  psrad $16, %%mm0\n"
      "  psrad $16, %%mm1\n"
      "  packssdw %%mm1, %%mm0\n"
      "  movq %%mm0, (%2,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx, deinterleave2_s16, OIL_IMPL_FLAG_MMX);

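/* MMXEXT version: one pshufw sorts a quad into even,even,odd,odd order,
 * so two pshufw/movd pairs yield two outputs per destination per
 * iteration. */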
void
deinterleave2_mmx_2 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  pshufw $0xd8, (%1,%%ecx,4), %%mm0\n"
      "  movd %%mm0, (%0,%%ecx,2)\n"
      "  pshufw $0x8d, (%1,%%ecx,4), %%mm0\n"
      "  movd %%mm0, (%2,%%ecx,2)\n"
      "  add $2, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_2, deinterleave2_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

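/* MMX version using the classic two-level punpcklwd/punpckhwd network to
 * separate even and odd words; four outputs per destination per
 * iteration. */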
void
deinterleave2_mmx_3 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm1\n"
      "  movq (%1,%%ecx,4), %%mm2\n"
      "  movq 8(%1,%%ecx,4), %%mm0\n"
      "  punpcklwd %%mm0, %%mm1\n"
      "  punpckhwd %%mm0, %%mm2\n"
      "  movq %%mm1, %%mm0\n"
      "  punpcklwd %%mm2, %%mm0\n"
      "  punpckhwd %%mm2, %%mm1\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  movq %%mm1, (%2,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_3, deinterleave2_s16, OIL_IMPL_FLAG_MMX);

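/* As deinterleave2_mmx_3, unrolled to two independent dependency chains
 * (the extra-indented instructions), presumably so they pair better on
 * dual-pipeline processors.  Eight outputs per destination per
 * iteration. */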
void
deinterleave2_mmx_4 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&7) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm1\n"
      "  movq %%mm1, %%mm2\n"
      "  movq 8(%1,%%ecx,4), %%mm0\n"
      "   movq 16(%1,%%ecx,4), %%mm5\n"
      "  punpcklwd %%mm0, %%mm1\n"
      "   movq %%mm5, %%mm6\n"
      "  punpckhwd %%mm0, %%mm2\n"
      "   movq 24(%1,%%ecx,4), %%mm4\n"
      "  movq %%mm1, %%mm0\n"
      "   punpcklwd %%mm4, %%mm5\n"
      "  punpcklwd %%mm2, %%mm0\n"
      "   punpckhwd %%mm4, %%mm6\n"
      "  punpckhwd %%mm2, %%mm1\n"
      "   movq %%mm5, %%mm4\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "   punpcklwd %%mm6, %%mm4\n"
      "  movq %%mm1, (%2,%%ecx,2)\n"
      "   punpckhwd %%mm6, %%mm5\n"
      "   movq %%mm4, 8(%0,%%ecx,2)\n"
      "   movq %%mm5, 8(%2,%%ecx,2)\n"
      "  add $8, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_4, deinterleave2_s16, OIL_IMPL_FLAG_MMX);


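/* Computes d[i] = s1[i] + ((s4[0]*(s2[i] + s3[i])) >> 12).  The 16x16
 * product is formed in halves with pmullw/pmulhw and recombined with
 * psrlw/psllw/por so bits 12..27 of the full 32-bit product end up in
 * the 16-bit result. */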
void
lift_add_mult_shift12_i386_mmx (int16_t *d, int16_t *s1, int16_t *s2,
    int16_t *s3, int16_t *s4, int n)
{
  uint32_t val;

  while (n&3) {
    d[0] = s1[0] + ((s4[0]*(s2[0] + s3[0]))>>12);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  val = ((*(uint16_t *)s4)<<16) | (*(uint16_t *)s4);
  n>>=2;
  asm volatile ("\n"
      "  mov %4, %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  punpcklwd %%mm7, %%mm7\n"
      "  mov %5, %%ecx\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  paddsw 0(%3), %%mm0\n"
      "  movq %%mm0, %%mm1\n"
      "  pmullw %%mm7, %%mm0\n"
      "  pmulhw %%mm7, %%mm1\n"
      "  psrlw $12, %%mm0\n"
      "  psllw $4, %%mm1\n"
      "  por %%mm1, %%mm0\n"
      "  paddsw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %%ecx\n"
      "  jne 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (val), "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12, OIL_IMPL_FLAG_MMX);

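/* Merge two planar int16 arrays into one interleaved stream:
 * punpcklwd/punpckhwd build the low and high interleaved quads. */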
void
interleave2_mmx (int16_t *d_2xn, int16_t *s1, int16_t *s2, int n)
{
  while (n&3) {
    d_2xn[0] = s1[0];
    d_2xn[1] = s2[0];
    s1++;
    s2++;
    d_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,2), %%mm0\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq %%mm0, %%mm2\n"
      "  punpckhwd %%mm1, %%mm0\n"
      "  punpcklwd %%mm1, %%mm2\n"
      "  movq %%mm2, (%0,%%ecx,4)\n"
      "  movq %%mm0, 8(%0,%%ecx,4)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d_2xn), "+r" (s1), "+r" (s2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (interleave2_mmx, interleave2_s16, OIL_IMPL_FLAG_MMX);

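/* The four lift_{add,sub}_shift{1,2} kernels below compute
 * d[i] = s1[i] +/- ((s2[i] + s3[i]) >> k) for k = 1 or 2, using paddw
 * and psraw exactly as in their C tail loops. */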
void
lift_add_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  paddw %%mm2, %%mm1\n"
      "  psraw $1, %%mm1\n"
      "  paddw (%1,%%ecx,2), %%mm1\n"
      "  movq %%mm1, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_shift1_mmx, lift_add_shift1, OIL_IMPL_FLAG_MMX);

void
lift_sub_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  movq (%1,%%ecx,2), %%mm0\n"
      "  paddw %%mm2, %%mm1\n"
      "  psraw $1, %%mm1\n"
      "  psubw %%mm1, %%mm0\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_sub_shift1_mmx, lift_sub_shift1, OIL_IMPL_FLAG_MMX);

void
lift_add_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  paddw %%mm2, %%mm1\n"
      "  psraw $2, %%mm1\n"
      "  paddw (%1,%%ecx,2), %%mm1\n"
      "  movq %%mm1, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_shift2_mmx, lift_add_shift2, OIL_IMPL_FLAG_MMX);

void
lift_sub_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  movq (%1,%%ecx,2), %%mm0\n"
      "  paddw %%mm2, %%mm1\n"
      "  psraw $2, %%mm1\n"
      "  psubw %%mm1, %%mm0\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_sub_shift2_mmx, lift_sub_shift2, OIL_IMPL_FLAG_MMX);

#ifdef ENABLE_BROKEN_IMPLS
/* Guarded as broken: "movw 2(%1), %%ecx" pairs a 16-bit mnemonic with a
 * 32-bit register (the assembler rejects it), and the "+ecx" output
 * constraint looks dubious, which is presumably why this implementation
 * is compiled out by default. */
void
synth_53_mmx (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  if (n==0) return;
  if (n == 1) {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    d_2xn[1] = s_2xn[1] + d_2xn[0];
  } else {
    int i;

    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);

    if (n > 6) {
      n-=5;

      asm volatile ("\n"
          "  xor %%ecx, %%ecx\n"
          "  movw 2(%1), %%ecx\n"
          "  movd %%ecx, %%mm7\n"
          "  movw 0(%0), %%ecx\n"
          "  movd %%ecx, %%mm6\n"
          "  movw 0(%1), %%ecx\n"
          "  movd %%ecx, %%mm5\n"

          "  xor %%ecx, %%ecx\n"
          "1:\n"
          "  movq 4(%1,%%ecx,4), %%mm1\n"  // mm1 = s5 s4 s3 s2
          "  movq %%mm1, %%mm2\n"          // mm2 = s5 s4 s3 s2
          "  movq 12(%1,%%ecx,4), %%mm0\n" // mm0 = s9 s8 s7 s6
          "  punpcklwd %%mm0, %%mm1\n"     // mm1 = s7 s3 s6 s2
          "  punpckhwd %%mm0, %%mm2\n"     // mm2 = s9 s5 s8 s4
          "  movq %%mm1, %%mm0\n"          // mm0 = s7 s3 s6 s2
          "  punpcklwd %%mm2, %%mm0\n"     // mm0 = s8 s6 s4 s2
          "  punpckhwd %%mm2, %%mm1\n"     // mm1 = s9 s7 s5 s3
          //"  movq %%mm0, %%mm3\n"          // mm0 = s8 s6 s4 s2

          "  movq %%mm1, %%mm2\n"          // mm2 = s9 s7 s5 s3
          "  psllq $16, %%mm2\n"           // mm2 = s7 s5 s3 00
          "  por %%mm7, %%mm2\n"           // mm2 = s7 s5 s3 s1
          "  movq %%mm2, %%mm4\n"          // mm4 = s7 s5 s3 s1
          "  paddw %%mm1, %%mm2\n"         // mm2 = s9+s7 ...
          "  psraw $2, %%mm2\n"            // mm2 = (s9+s7)>>2 ...
          "  movq %%mm1, %%mm7\n"          // mm7 = s9 s7 s5 s3
          "  psrlq $48, %%mm7\n"           // mm7 = 00 00 00 s9
          "  psubw %%mm2, %%mm0\n"         // mm0 = d8 d6 d4 d2

          "  movq %%mm0, %%mm1\n"          // mm1 = d8 d6 d4 d2
          "  movq %%mm0, %%mm3\n"          // mm3 = d8 d6 d4 d2
          "  psllq $16, %%mm0\n"           // mm0 = d6 d4 d2 00
          "  por %%mm6, %%mm0\n"           // mm0 = d6 d4 d2 d0
          "  psrlq $48, %%mm1\n"           // mm1 = 00 00 00 d8
          "  movq %%mm1, %%mm6\n"          // mm6 = 00 00 00 d8

          "  movq %%mm0, %%mm1\n"
          "  paddw %%mm3, %%mm1\n"         // mm1 = d8+d6 ...
          "  psraw $1, %%mm1\n"            // mm1 = (d8+d6)>>1 ...
          "  paddw %%mm4, %%mm1\n"         // mm1 = d7 d5 d3 d1

          "  movq %%mm1, %%mm2\n"

          "  movq %%mm0, %%mm1\n"
          "  punpcklwd %%mm2, %%mm0\n"
          "  punpckhwd %%mm2, %%mm1\n"

          "  movq %%mm0, (%0, %%ecx, 4)\n"
          "  movq %%mm1, 8(%0, %%ecx, 4)\n"

          "  add $4, %%ecx\n"
          "  cmp %3, %%ecx\n"
          "  jl 1b\n"
          "  emms\n"
          : "+r" (d_2xn), "+r" (s_2xn), "+ecx" (i)
          : "m" (n));

      i*=2;
      n+=5;
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      i+=2;
    } else {
      i = 2;
    }
    for(;i<n*2-2;i+=2){
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
    }
    d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
    d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
    d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
  }
}
OIL_DEFINE_IMPL_FULL (synth_53_mmx, synth_53, OIL_IMPL_FLAG_MMX);
#endif


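/* 2-tap multiply-accumulate:
 *   d1[i] = s1[i] + ((s2[i]*s3_2[0] + s2[i+1]*s3_2[1] + s4_2[0]) >> s4_2[1]).
 * Keeps 32-bit intermediates by splitting each 16x16 product with
 * pmullw/pmulhw and repacking into dwords before the offset add and the
 * arithmetic shift. */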
void
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];

  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pmullw %%mm7, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm7, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 2(%2), %%mm2\n"
      "  movq 2(%2), %%mm3\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1

      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  paddd %%mm5, %%mm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
      "  paddd %%mm5, %%mm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...

      "  movd %4, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      "  psrad %%mm4, %%mm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

#if 0
/* Disabled sketch: "paddq" is an SSE2 instruction, and the loop body
 * never performs the multiplies, so it cannot match the reference. */
void
mas2_add_s16_lim_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];

  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  paddq 2(%2), %%mm0\n"

      "  movd %4, %%mm4\n"
      "  psraw %%mm4, %%mm0\n"

      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_lim_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

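/* 4-tap version of mas2_add_s16_mmx: the four coefficients are held in
 * mm7 and broadcast one at a time with pshufw ($0x00/$0x55/$0xaa/$0xff)
 * into the same pmullw/pmulhw accumulation. */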
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_4,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];
  //int m;

  //m = n&3;
#if 1
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
#endif
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movq 0(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_4), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pshufw $0x00, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm6, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 2(%2), %%mm2\n"
      "  movq 2(%2), %%mm3\n"
      "  pshufw $0x55, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1
      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  movq 4(%2), %%mm2\n"
      "  movq 4(%2), %%mm3\n"
      "  pshufw $0xaa, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  movq %%mm4, %%mm3\n"
      "  paddd %%mm3, %%mm1\n"
      "  paddd %%mm2, %%mm0\n"

      "  movq 6(%2), %%mm2\n"
      "  movq 6(%2), %%mm3\n"
      "  pshufw $0xff, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  movq %%mm4, %%mm3\n"
      "  paddd %%mm3, %%mm1\n"
      "  paddd %%mm2, %%mm0\n"

      "  paddd %%mm5, %%mm1\n"
      "  paddd %%mm5, %%mm0\n"

      "  movd %4, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"
      "  psrad %%mm4, %%mm0\n"

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
#if 0
  while (m) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    m--;
  }
#endif
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x00, %%mm5, %%mm5\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw %%mm7, %%mm0\n"
      "  movq 2(%2), %%mm1\n"
      "  pmullw %%mm6, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  paddw %%mm5, %%mm0\n"
      "  psraw %%mm4, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif


#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1] +
      s2[2]*s3_2[2] + s2[3]*s3_2[3];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 4(%0), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x00, %%mm5, %%mm5\n"
      "  movzwl 6(%0), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      "  pshufw $0x00, %%mm4, %%mm4\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  pshufw $0x00, %%mm3, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm2\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw %%mm7, %%mm0\n"
      "  movq 2(%2), %%mm1\n"
      "  pmullw %%mm6, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  movq 4(%2), %%mm1\n"
      "  pmullw %%mm5, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  movq 6(%2), %%mm1\n"
      "  pmullw %%mm4, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  paddw %%mm3, %%mm0\n"
      "  psraw %%mm2, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif


#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas8_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<8;i++){
      x += s2[i]*s3_2[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movq 8(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  pshufw $0x00, %%mm3, %%mm3\n"
      "  pxor %%mm4, %%mm4\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  pshufw $0x00, %%mm6, %%mm1\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw %%mm1, %%mm0\n"
      "  pshufw $0x55, %%mm6, %%mm2\n"
      "  movq 2(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0xaa, %%mm6, %%mm2\n"
      "  movq 4(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0xff, %%mm6, %%mm2\n"
      "  movq 6(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"

      "  pshufw $0x00, %%mm7, %%mm2\n"
      "  movq 8(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0x55, %%mm7, %%mm2\n"
      "  movq 10(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0xaa, %%mm7, %%mm2\n"
      "  movq 12(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0xff, %%mm7, %%mm2\n"
      "  movq 14(%2), %%mm1\n"
      "  pmullw %%mm2, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"

      "  paddw %%mm3, %%mm0\n"
      "  psraw %%mm4, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_mmx, mas8_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif


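/* pmaddwd forms two 16x16->32 products and sums adjacent pairs, so a
 * 4-tap MAC reduces to one pmaddwd plus a horizontal add; this variant
 * produces one output sample per loop iteration. */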
void
mas4_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
      "  paddd %%mm1, %%mm0\n"
      "  paddd %%mm3, %%mm0\n"
      "  psrad %%mm4, %%mm0\n"
      "  movd %%mm0, %%eax\n"
      "  addw 0(%1), %%ax\n"
      "  movw %%ax, 0(%0)\n"
      "  add $2, %0\n"
      "  add $2, %1\n"
      "  add $2, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

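/* Two outputs per iteration: a second pmaddwd on the (unaligned) window
 * starting one sample later, merged with punpckldq/punpckhdq before the
 * shared offset/shift. */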
void
mas4_add_s16_pmaddwd_2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  pshufw $0x44, %%mm3, %%mm3\n" // 01 00 01 00
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  if (n&1) {
    asm volatile ("\n"
        "  movq 0(%2), %%mm0\n"
        "  pmaddwd %%mm6, %%mm0\n"
        "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
        "  paddd %%mm1, %%mm0\n"
        "  paddd %%mm3, %%mm0\n"
        "  psrad %%mm4, %%mm0\n"
        "  movd %%mm0, %%eax\n"
        "  addw 0(%1), %%ax\n"
        "  movw %%ax, 0(%0)\n"
        "  add $2, %0\n"
        "  add $2, %1\n"
        "  add $2, %2\n"
        "  decl %3\n"
        : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
        :
        : "eax"
        );
  }
  n>>=1;
  /* if n was 1, the odd-sample block above consumed the whole input and
   * the counted loop below would underflow; bail out (clearing MMX state) */
  if (n==0) {
    asm volatile ("  emms\n");
    return;
  }
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  movq 2(%2), %%mm2\n"
      "  pmaddwd %%mm6, %%mm2\n"

      "  movq %%mm0, %%mm1\n"
      "  punpckhdq %%mm2, %%mm0\n"
      "  punpckldq %%mm2, %%mm1\n"

      "  paddd %%mm1, %%mm0\n"
      "  paddd %%mm3, %%mm0\n"
      "  psrad %%mm4, %%mm0\n"
      "  pshufw $0xd8, %%mm0, %%mm0\n" // 11 01 10 00

      "  paddw 0(%1), %%mm0\n"
      "  movd %%mm0, 0(%0)\n"
      "  add $4, %0\n"
      "  add $4, %1\n"
      "  add $4, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd_2, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

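/* 8-tap variant: two pmaddwd accumulations (taps 0-3 in mm6, taps 4-7 in
 * mm7) are summed before the same horizontal reduction. */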
void
mas8_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movq 8(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  movq 8(%2), %%mm1\n"
      "  pmaddwd %%mm7, %%mm1\n"
      "  paddd %%mm1, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n"
      "  paddd %%mm1, %%mm0\n"
      "  paddd %%mm3, %%mm0\n"
      "  psrad %%mm4, %%mm0\n"
      "  movd %%mm0, %%eax\n"
      "  addw 0(%1), %%ax\n"
      "  movw %%ax, 0(%0)\n"
      "  add $2, %0\n"
      "  add $2, %1\n"
      "  add $2, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd, mas8_add_s16, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);



#if 0
void
mas8_add_s16_pmaddwd2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<8;i++){
      x += s2[i]*s3_2[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movq 8(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x00, %%mm5, %%mm5\n"
      "  pxor %%mm4, %%mm4\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  movq 8(%2), %%mm1\n"
      "  pmaddwd %%mm7, %%mm1\n"
      "  paddd %%mm1, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"

      "  movq 2(%2), %%mm2\n"
      "  pmaddwd %%mm6, %%mm2\n"
      "  movq 10(%2), %%mm3\n"
      "  pmaddwd %%mm7, %%mm3\n"
      "  paddd %%mm3, %%mm2\n"
      "  pshufw $0xee, %%mm2, %%mm3\n"
      "  paddw %%mm3, %%mm2\n"
      "  pextrw $0, %%mm2, %%eax\n"
      "  pinsrw $1, %%eax, %%mm0\n"

      "  movq 4(%2), %%mm2\n"
      "  pmaddwd %%mm6, %%mm2\n"
      "  movq 12(%2), %%mm3\n"
      "  pmaddwd %%mm7, %%mm3\n"
      "  paddd %%mm3, %%mm2\n"
      "  pshufw $0xee, %%mm2, %%mm3\n"
      "  paddw %%mm3, %%mm2\n"
      "  pextrw $0, %%mm2, %%eax\n"
      "  pinsrw $2, %%eax, %%mm0\n"

      "  movq 6(%2), %%mm2\n"
      "  pmaddwd %%mm6, %%mm2\n"
      "  movq 14(%2), %%mm3\n"
      "  pmaddwd %%mm7, %%mm3\n"
      "  paddd %%mm3, %%mm2\n"
      "  pshufw $0xee, %%mm2, %%mm3\n"
      "  paddw %%mm3, %%mm2\n"
      "  pextrw $0, %%mm2, %%eax\n"
      "  pinsrw $3, %%eax, %%mm0\n"

      "  paddw %%mm5, %%mm0\n"
      "  psraw %%mm4, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
#endif

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas8_add_s16_sse2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movq 8(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  pshufw $0x00, %%mm3, %%mm3\n"
      "  pxor %%mm4, %%mm4\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw %%mm6, %%mm0\n"
      "  movq 8(%2), %%mm1\n"
      "  pmullw %%mm7, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  pshufw $0x01, %%mm0, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  paddw %%mm3, %%mm0\n"
      "  psraw %%mm4, %%mm0\n"
      "  movd %%mm0, %%eax\n"
      "  addw 0(%1), %%ax\n"
      "  movw %%ax, 0(%0)\n"
      "  add $2, %0\n"
      "  add $2, %1\n"
      "  add $2, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_sse2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
#endif

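/* 2-tap filter across two rows (s2 and s3) instead of along one array,
 * with the same pmullw/pmulhw 32-bit widening as mas2_add_s16_mmx. */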
void
mas2_across_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
    int16_t *s4_2, int16_t *s5_2, int n)
{
  int shift = s5_2[1];

  while (n&3) {
    int x;

    x = s5_2[0] + s2[0]*s4_2[0] + s3[0]*s4_2[1];
    x >>= s5_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s4_2), "r" (s5_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pmullw %%mm7, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm7, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 0(%3), %%mm2\n"
      "  movq 0(%3), %%mm3\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1

      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  paddd %%mm5, %%mm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
      "  paddd %%mm5, %%mm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...

      "  movd %5, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      "  psrad %%mm4, %%mm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+m" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_across_add_s16_mmx, mas2_across_add_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

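/* d1[i] = (s1[i] + s2_2[0]) >> s2_2[1]; the constant and shift count are
 * broadcast into mm7/mm6 once, outside the loop. */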
void
add_const_rshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2_2, int n)
{
  while(n&3) {
    d1[0] = (s1[0] + s2_2[0])>>s2_2[1];
    d1++;
    s1++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "  movzwl 0(%2), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%2), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "1:\n"
      "  movq 0(%1), %%mm0\n"
      "  paddsw %%mm7, %%mm0\n"
      "  psraw %%mm6, %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2_2), "+r" (n)
      :
      : "ecx"
      );
}
OIL_DEFINE_IMPL_FULL (add_const_rshift_s16_mmx, add_const_rshift_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

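/* d1[i] = s1[i] + s2[i]*s3[i], using pmullw (the low 16 bits of each
 * product), which matches the C tail loop's int16 store. */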
void
multiply_and_add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while(n&3) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw 0(%3), %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_mmx, multiply_and_add_s16,
    OIL_IMPL_FLAG_MMX);

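/* Same multiply-add with uint8 s3: bytes are zero-extended to words via
 * punpcklbw against a zeroed mm7 before the pmullw.  The _2 variant
 * below unrolls this into two interleaved 4-sample chains. */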
void
multiply_and_add_s16_u8_mmx(int16_t *d1, int16_t *s1, int16_t *s2,
    uint8_t *s3, int n)
{
  while(n&3) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%3), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "  pmullw 0(%2), %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $4, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8,
    OIL_IMPL_FLAG_MMX);

void
multiply_and_add_s16_u8_mmx_2(int16_t *d1, int16_t *s1, int16_t *s2,
    uint8_t *s3, int n)
{
  while(n&7) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=3;
  if (n==0) return;
  asm volatile ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%3), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "   movd 4(%3), %%mm1\n"
      "  pmullw 0(%2), %%mm0\n"
      "   punpcklbw %%mm7, %%mm1\n"
      "  paddw 0(%1), %%mm0\n"
      "   pmullw 8(%2), %%mm1\n"
      "  movq %%mm0, 0(%0)\n"
      "   paddw 8(%1), %%mm1\n"
      "   movq %%mm1, 8(%0)\n"

      "  add $16, %0\n"
      "  add $16, %1\n"
      "  add $16, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8,
    OIL_IMPL_FLAG_MMX);

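/* Strided accumulate over 12-sample rows: each iteration handles three
 * 4-sample groups of one row, accumulating into i1 in place, then steps
 * all three pointers by their byte strides (is1/ss1/ss2). */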
void
multiply_and_acc_12xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
    int ss1, uint8_t *s2, int ss2, int n)
{
  if (n==0) return;
  __asm__ __volatile__ ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%2), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "  pmullw 0(%1), %%mm0\n"
      "  paddw 0(%0), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "   movd 4(%2), %%mm1\n"
      "   punpcklbw %%mm7, %%mm1\n"
      "   pmullw 8(%1), %%mm1\n"
      "   paddw 8(%0), %%mm1\n"
      "   movq %%mm1, 8(%0)\n"
      "    movd 8(%2), %%mm2\n"
      "    punpcklbw %%mm7, %%mm2\n"
      "    pmullw 16(%1), %%mm2\n"
      "    paddw 16(%0), %%mm2\n"
      "    movq %%mm2, 16(%0)\n"

      "  addl %4, %0\n"
      "  addl %5, %1\n"
      "  addl %6, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
      : "m" (is1), "m" (ss1), "m" (ss2)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_acc_12xn_s16_u8_mmx,
    multiply_and_acc_12xn_s16_u8, OIL_IMPL_FLAG_MMX);

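/* Two mas4_across_add_s16 implementations follow; the first, guarded by
 * ENABLE_BROKEN_IMPLS, uses psraw on 16-bit intermediates where the
 * reference requires 32-bit.  Note it also shares its name with the
 * unguarded version below, so building with the guard enabled would
 * appear to produce a duplicate definition. */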
  1669 #ifdef ENABLE_BROKEN_IMPLS
  1670 void
  1671 mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
  1672     int16_t *s3_4, int16_t *s4_2, int n)
  1673 {
  1674   int16_t *s2_nx4_off;
  1675 
  1676   while (n&3) {
  1677     int x;
  1678     int j;
  1679     x = s4_2[0];
  1680     for(j=0;j<4;j++){
  1681       x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
  1682     }
  1683     x >>= s4_2[1];
  1684     d[0] = s1[0] + x;
  1685 
  1686     n--;
  1687     d++;
  1688     s1++;
  1689     s2_nx4++;
  1690   }
  1691   if (n==0) return;
  1692 
  1693   s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);
  1694 
  1695   n >>= 2;
  1696   __asm__ __volatile__ ("\n"
  1697       "  movq 0(%[s3_4]), %%mm0\n"
  1698       "  pshufw $0x55, %%mm0, %%mm1\n"
  1699       "  pshufw $0xaa, %%mm0, %%mm2\n"
  1700       "  pshufw $0xff, %%mm0, %%mm3\n"
  1701       "  pshufw $0x00, %%mm0, %%mm0\n"
  1702       "  movzwl 0(%[s4_2]), %%ecx\n"
  1703       "  movd %%ecx, %%mm7\n"
  1704       "  pshufw $0x00, %%mm7, %%mm7\n"
  1705       "  movzwl 2(%[s4_2]), %%ecx\n"
  1706       "  movd %%ecx, %%mm6\n"
  1707       :
  1708       : [s3_4] "r" (s3_4),
  1709         [s4_2] "r" (s4_2)
  1710       : "ecx"
  1711       );
  1712 
  1713   __asm__ __volatile__ ("\n"
  1714       "1:\n"
  1715       "  movq 0(%[s2_nx4]), %%mm4\n"
  1716       "  pmullw %%mm0, %%mm4\n"
  1717       "  movq (%[s2_nx4],%[sstr]), %%mm5\n"
  1718       "  pmullw %%mm1, %%mm5\n"
  1719       "  paddsw %%mm5,%%mm4\n"
  1720       "  movq (%[s2_nx4],%[sstr],2), %%mm5\n"
  1721       "  pmullw %%mm2, %%mm5\n"
  1722       "  paddsw %%mm5,%%mm4\n"
  1723       "  movq (%[s2_nx4_off]), %%mm5\n"
  1724       "  pmullw %%mm3, %%mm5\n"
  1725       "  paddsw %%mm5,%%mm4\n"
  1726       "  paddsw %%mm7, %%mm4\n"
  1727       "  psraw %%mm6, %%mm4\n"
  1728       "  paddsw (%[s1]),%%mm4\n"
  1729       "  movq %%mm4, 0(%[d])\n"
  1730 
  1731       "  addl $8, %[s2_nx4]\n"
  1732       "  addl $8, %[s2_nx4_off]\n"
  1733       "  addl $8, %[s1]\n"
  1734       "  addl $8, %[d]\n"
  1735       "  decl %[n]\n"
  1736       "  jnz 1b\n"
  1737       "  emms\n"
  1738       : [s2_nx4] "+r" (s2_nx4),
  1739         [d] "+r" (d),
  1740         [s2_nx4_off] "+r" (s2_nx4_off),
  1741         [n] "+m" (n),
  1742         [s1] "+r" (s1)
  1743       : [sstr] "r" (sstr2)
  1744       );
  1745 }
  1746 OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
  1747     OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
#else
  1749 
  1750 void
  1751 mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
  1752     int16_t *s3_4, int16_t *s4_2, int n)
  1753 {
  1754   int16_t *s2_nx4_off;
  1755 
  1756   while (n&3) {
  1757     int x;
  1758     int j;
  1759     x = s4_2[0];
  1760     for(j=0;j<4;j++){
  1761       x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
  1762     }
  1763     x >>= s4_2[1];
  1764     d[0] = s1[0] + x;
  1765 
  1766     n--;
  1767     d++;
  1768     s1++;
  1769     s2_nx4++;
  1770   }
  1771   if (n==0) return;
  1772 
  1773   s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);
  1774 
  1775   n >>= 2;
  1776   __asm__ __volatile__ ("\n"
  1777       "  movq 0(%[s3_4]), %%mm0\n"
  1778       "  pxor %%mm5, %%mm5\n"
  1779       "  movd 0(%[s4_2]), %%mm5\n"
  1780       :
  1781       : [s3_4] "r" (s3_4),
  1782         [s4_2] "r" (s4_2)
  1783       );
  1784 
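      /* Each tap: pmullw/pmulhw give the low and high 16 bits of the
       * signed products, and punpcklwd/punpckhwd interleave them into
       * four 32-bit products accumulated in %mm6/%mm7; the offset and
       * arithmetic shift from s4_2 are applied before packing back to
       * s16. */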
  1785   __asm__ __volatile__ ("\n"
  1786       "1:\n"
  1787       "  pshufw $0x00, %%mm0, %%mm6\n"
  1788       "  pmullw 0(%[s2_nx4]), %%mm6\n"
  1789       "  pshufw $0x00, %%mm0, %%mm3\n"
  1790       "  pmulhw 0(%[s2_nx4]), %%mm3\n"
  1791       "  movq %%mm6, %%mm7\n"
  1792       "  punpcklwd %%mm3, %%mm6\n"
  1793       "  punpckhwd %%mm3, %%mm7\n"
  1794 
  1795       "  pshufw $0x55, %%mm0, %%mm2\n"
  1796       "  pmullw 0(%[s2_nx4],%[sstr]), %%mm2\n"
  1797       "  pshufw $0x55, %%mm0, %%mm3\n"
  1798       "  pmulhw 0(%[s2_nx4],%[sstr]), %%mm3\n"
  1799       "  movq %%mm2, %%mm4\n"
  1800       "  punpcklwd %%mm3, %%mm2\n"
  1801       "  punpckhwd %%mm3, %%mm4\n"
  1802       "  paddd %%mm2, %%mm6\n"
  1803       "  paddd %%mm4, %%mm7\n"
  1804 
  1805       "  pshufw $0xaa, %%mm0, %%mm2\n"
  1806       "  pmullw 0(%[s2_nx4],%[sstr],2), %%mm2\n"
  1807       "  pshufw $0xaa, %%mm0, %%mm3\n"
  1808       "  pmulhw 0(%[s2_nx4],%[sstr],2), %%mm3\n"
  1809       "  movq %%mm2, %%mm4\n"
  1810       "  punpcklwd %%mm3, %%mm2\n"
  1811       "  punpckhwd %%mm3, %%mm4\n"
  1812       "  paddd %%mm2, %%mm6\n"
  1813       "  paddd %%mm4, %%mm7\n"
  1814 
  1815       "  pshufw $0xff, %%mm0, %%mm2\n"
  1816       "  pmullw 0(%[s2_nx4_off]), %%mm2\n"
  1817       "  pshufw $0xff, %%mm0, %%mm3\n"
  1818       "  pmulhw 0(%[s2_nx4_off]), %%mm3\n"
  1819       "  movq %%mm2, %%mm4\n"
  1820       "  punpcklwd %%mm3, %%mm2\n"
  1821       "  punpckhwd %%mm3, %%mm4\n"
  1822       "  paddd %%mm2, %%mm6\n"
  1823       "  paddd %%mm4, %%mm7\n"
  1824 
  1825       "  pshufw $0xcc, %%mm5, %%mm1\n"
  1826       "  paddd %%mm1, %%mm6\n"
  1827       "  paddd %%mm1, %%mm7\n"
  1828 
  1829       "  pshufw $0xfd, %%mm5, %%mm1\n"
  1830       "  psrad %%mm1, %%mm6\n"
  1831       "  psrad %%mm1, %%mm7\n"
  1832       "  packssdw %%mm7, %%mm6\n"
  1833 
  1834       "  paddsw (%[s1]),%%mm6\n"
  1835       "  movq %%mm6, 0(%[d])\n"
  1836 
  1837       "  addl $8, %[s2_nx4]\n"
  1838       "  addl $8, %[s2_nx4_off]\n"
  1839       "  addl $8, %[s1]\n"
  1840       "  addl $8, %[d]\n"
  1841       "  decl %[n]\n"
  1842       "  jnz 1b\n"
  1843       "  emms\n"
  1844       : [s2_nx4] "+r" (s2_nx4),
  1845         [d] "+r" (d),
  1846         [s2_nx4_off] "+r" (s2_nx4_off),
  1847         [n] "+m" (n),
  1848         [s1] "+r" (s1)
  1849       : [sstr] "r" (sstr2)
  1850       );
  1851 }
  1852 OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
#endif
  1854 
  1855 void
  1856 mas8_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx8, int sstr2,
  1857     int16_t *s3_8, int16_t *s4_2, int n)
  1858 {
  1859   int16_t *s2_nx8_off;
  1860   void *tmp = NULL;
  1861 
  1862   while (n&3) {
  1863     int x;
  1864     int j;
  1865     x = s4_2[0];
  1866     for(j=0;j<8;j++){
  1867       x += OIL_GET(s2_nx8, j*sstr2, int16_t)*s3_8[j];
  1868     }
  1869     x >>= s4_2[1];
  1870     d[0] = s1[0] + x;
  1871 
  1872     n--;
  1873     d++;
  1874     s1++;
  1875     s2_nx8++;
  1876   }
  1877   if (n==0) return;
  1878 
  1879   s2_nx8_off = OIL_OFFSET(s2_nx8, 7*sstr2);
  1880 
  1881   n >>= 2;
  1882   __asm__ __volatile__ ("\n"
  1883       "  movq 0(%[s3_8]), %%mm0\n"
  1884       "  pxor %%mm5, %%mm5\n"
  1885       "  movd 0(%[s4_2]), %%mm5\n"
  1886       :
  1887       : [s3_8] "r" (s3_8),
  1888         [s4_2] "r" (s4_2)
  1889       );
  1890 
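      /* Same 32-bit accumulation scheme as mas4_across_add_s16_mmx above,
       * but the eight taps are walked row by row: [tmp] starts at s2_nx8
       * and is advanced by the byte stride [sstr] between taps. */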
  1891   __asm__ __volatile__ ("\n"
  1892       "1:\n"
  1893       "  movl %[s2_nx8], %[tmp]\n"
  1894       "  movq 0(%[s3_8]), %%mm0\n"
  1895 
  1896       "  pshufw $0x00, %%mm0, %%mm6\n"
  1897       "  pmullw 0(%[tmp]), %%mm6\n"
  1898       "  pshufw $0x00, %%mm0, %%mm3\n"
  1899       "  pmulhw 0(%[tmp]), %%mm3\n"
  1900       "  movq %%mm6, %%mm7\n"
  1901       "  punpcklwd %%mm3, %%mm6\n"
  1902       "  punpckhwd %%mm3, %%mm7\n"
  1903 
  1904       "  addl %[sstr], %[tmp]\n"
  1905       "  pshufw $0x55, %%mm0, %%mm2\n"
  1906       "  pmullw 0(%[tmp]), %%mm2\n"
  1907       "  pshufw $0x55, %%mm0, %%mm3\n"
  1908       "  pmulhw 0(%[tmp]), %%mm3\n"
  1909       "  movq %%mm2, %%mm4\n"
  1910       "  punpcklwd %%mm3, %%mm2\n"
  1911       "  punpckhwd %%mm3, %%mm4\n"
  1912       "  paddd %%mm2, %%mm6\n"
  1913       "  paddd %%mm4, %%mm7\n"
  1914 
  1915       "  addl %[sstr], %[tmp]\n"
  1916       "  pshufw $0xaa, %%mm0, %%mm2\n"
  1917       "  pmullw 0(%[tmp]), %%mm2\n"
  1918       "  pshufw $0xaa, %%mm0, %%mm3\n"
  1919       "  pmulhw 0(%[tmp]), %%mm3\n"
  1920       "  movq %%mm2, %%mm4\n"
  1921       "  punpcklwd %%mm3, %%mm2\n"
  1922       "  punpckhwd %%mm3, %%mm4\n"
  1923       "  paddd %%mm2, %%mm6\n"
  1924       "  paddd %%mm4, %%mm7\n"
  1925 
  1926       "  addl %[sstr], %[tmp]\n"
  1927       "  pshufw $0xff, %%mm0, %%mm2\n"
  1928       "  pmullw 0(%[tmp]), %%mm2\n"
  1929       "  pshufw $0xff, %%mm0, %%mm3\n"
  1930       "  pmulhw 0(%[tmp]), %%mm3\n"
  1931       "  movq %%mm2, %%mm4\n"
  1932       "  punpcklwd %%mm3, %%mm2\n"
  1933       "  punpckhwd %%mm3, %%mm4\n"
  1934       "  paddd %%mm2, %%mm6\n"
  1935       "  paddd %%mm4, %%mm7\n"
  1936 
  1937       "  movq 8(%[s3_8]), %%mm0\n"
  1938 
  1939       "  addl %[sstr], %[tmp]\n"
  1940       "  pshufw $0x00, %%mm0, %%mm2\n"
  1941       "  pmullw 0(%[tmp]), %%mm2\n"
  1942       "  pshufw $0x00, %%mm0, %%mm3\n"
  1943       "  pmulhw 0(%[tmp]), %%mm3\n"
  1944       "  movq %%mm2, %%mm4\n"
  1945       "  punpcklwd %%mm3, %%mm2\n"
  1946       "  punpckhwd %%mm3, %%mm4\n"
  1947       "  paddd %%mm2, %%mm6\n"
  1948       "  paddd %%mm4, %%mm7\n"
  1949 
  1950       "  addl %[sstr], %[tmp]\n"
  1951       "  pshufw $0x55, %%mm0, %%mm2\n"
  1952       "  pmullw 0(%[tmp]), %%mm2\n"
  1953       "  pshufw $0x55, %%mm0, %%mm3\n"
  1954       "  pmulhw 0(%[tmp]), %%mm3\n"
  1955       "  movq %%mm2, %%mm4\n"
  1956       "  punpcklwd %%mm3, %%mm2\n"
  1957       "  punpckhwd %%mm3, %%mm4\n"
  1958       "  paddd %%mm2, %%mm6\n"
  1959       "  paddd %%mm4, %%mm7\n"
  1960 
  1961       "  addl %[sstr], %[tmp]\n"
  1962       "  pshufw $0xaa, %%mm0, %%mm2\n"
  1963       "  pmullw 0(%[tmp]), %%mm2\n"
  1964       "  pshufw $0xaa, %%mm0, %%mm3\n"
  1965       "  pmulhw 0(%[tmp]), %%mm3\n"
  1966       "  movq %%mm2, %%mm4\n"
  1967       "  punpcklwd %%mm3, %%mm2\n"
  1968       "  punpckhwd %%mm3, %%mm4\n"
  1969       "  paddd %%mm2, %%mm6\n"
  1970       "  paddd %%mm4, %%mm7\n"
  1971 
  1972       "  addl %[sstr], %[tmp]\n"
  1973       "  pshufw $0xff, %%mm0, %%mm2\n"
  1974       "  pmullw 0(%[tmp]), %%mm2\n"
  1975       "  pshufw $0xff, %%mm0, %%mm3\n"
  1976       "  pmulhw 0(%[tmp]), %%mm3\n"
  1977       "  movq %%mm2, %%mm4\n"
  1978       "  punpcklwd %%mm3, %%mm2\n"
  1979       "  punpckhwd %%mm3, %%mm4\n"
  1980       "  paddd %%mm2, %%mm6\n"
  1981       "  paddd %%mm4, %%mm7\n"
  1982 
  1983       "  pshufw $0xcc, %%mm5, %%mm1\n"
  1984       "  paddd %%mm1, %%mm6\n"
  1985       "  paddd %%mm1, %%mm7\n"
  1986 
  1987       "  pshufw $0xfd, %%mm5, %%mm1\n"
  1988       "  psrad %%mm1, %%mm6\n"
  1989       "  psrad %%mm1, %%mm7\n"
  1990       "  packssdw %%mm7, %%mm6\n"
  1991 
  1992       "  paddsw (%[s1]),%%mm6\n"
  1993       "  movq %%mm6, 0(%[d])\n"
  1994 
  1995       "  addl $8, %[s2_nx8]\n"
  1996       "  addl $8, %[s1]\n"
  1997       "  addl $8, %[d]\n"
  1998       "  decl %[n]\n"
  1999       "  jnz 1b\n"
  2000       "  emms\n"
  2001       : [s2_nx8] "+r" (s2_nx8),
  2002         [tmp] "+r" (tmp),
  2003         [s3_8] "+r" (s3_8),
  2004         [d] "+r" (d),
  2005         [n] "+m" (n),
  2006         [s1] "+r" (s1)
  2007       : [sstr] "m" (sstr2)
  2008       );
  2009 }
  2010 OIL_DEFINE_IMPL_FULL (mas8_across_add_s16_mmx, mas8_across_add_s16,
  2011     OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
  2012 
  2013 void
  2014 lshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
  2015 {
  2016   while (n&3) {
  2017     d1[0] = s1[0]<<s3_1[0];
  2018     d1++;
  2019     s1++;
  2020     n--;
  2021   }
  n >>= 2;
  if (n == 0) return;
  2023   __asm__ __volatile__ ("\n"
  2024       "  movzwl 0(%[s3_1]), %%ecx\n"
  2025       "  movd %%ecx, %%mm1\n"
  2026       "1:\n"
  2027       "  movq 0(%[s1]), %%mm0\n"
  2028       "  psllw %%mm1, %%mm0\n"
  2029       "  movq %%mm0, 0(%[d1])\n"
  2030       "  add $8, %[d1]\n"
  2031       "  add $8, %[s1]\n"
  2032       "  decl %[n]\n"
  2033       "  jnz 1b\n"
  2034       "  emms"
  2035       : [d1] "+r" (d1),
  2036         [s1] "+r" (s1),
  2037         [n] "+r" (n)
  2038       : [s3_1] "r" (s3_1)
  2039       : "ecx");
  2040 }
  2041 OIL_DEFINE_IMPL_FULL (lshift_s16_mmx, lshift_s16, OIL_IMPL_FLAG_MMX);
  2042 
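/* Same operation as lshift_s16_mmx above, unrolled to two quadwords
 * (eight samples) per iteration. */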
  2043 void
  2044 lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
  2045 {
  2046   while (n&7) {
  2047     d1[0] = s1[0]<<s3_1[0];
  2048     d1++;
  2049     s1++;
  2050     n--;
  2051   }
  2052   n >>= 3;
  2053   if (n == 0) return;
  2054   __asm__ __volatile__ ("\n"
  2055       "  movzwl 0(%[s3_1]), %%ecx\n"
  2056       "  movd %%ecx, %%mm1\n"
  2057       "1:\n"
  2058       "  movq 0(%[s1]), %%mm0\n"
  2059       "  psllw %%mm1, %%mm0\n"
  2060       "  movq %%mm0, 0(%[d1])\n"
  2061       "  movq 8(%[s1]), %%mm0\n"
  2062       "  psllw %%mm1, %%mm0\n"
  2063       "  movq %%mm0, 8(%[d1])\n"
  2064       "  add $16, %[d1]\n"
  2065       "  add $16, %[s1]\n"
  2066       "  decl %[n]\n"
  2067       "  jnz 1b\n"
  2068       "  emms"
  2069       : [d1] "+r" (d1),
  2070         [s1] "+r" (s1),
  2071         [n] "+r" (n)
  2072       : [s3_1] "r" (s3_1)
  2073       : "ecx");
  2074 }
  2075 OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
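
/* Scalar sketch of the lshift_s16 operation for reference (illustrative
 * name, not a registered implementation): every element is shifted left
 * by the single count s3_1[0]. */
#if 0
static void
lshift_s16_ref (int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  int i;
  for (i = 0; i < n; i++) {
    d1[i] = s1[i] << s3_1[0];
  }
}
#endif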
  2076 
  2077 
  2078 
  2079 
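/* Symbian builds reach each implementation record through a getter
 * function, apparently because Symbian DLLs export functions rather than
 * data; the stubs below only return the address of the corresponding
 * OilFunctionImpl defined above. */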
#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx() {
		return &_oil_function_impl_deinterleave2_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_2() {
		return &_oil_function_impl_deinterleave2_mmx_2;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_3() {
		return &_oil_function_impl_deinterleave2_mmx_3;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_4() {
		return &_oil_function_impl_deinterleave2_mmx_4;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_mult_shift12_i386_mmx() {
		return &_oil_function_impl_lift_add_mult_shift12_i386_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_interleave2_mmx() {
		return &_oil_function_impl_interleave2_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_shift1_mmx() {
		return &_oil_function_impl_lift_add_shift1_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_sub_shift1_mmx() {
		return &_oil_function_impl_lift_sub_shift1_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_shift2_mmx() {
		return &_oil_function_impl_lift_add_shift2_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_sub_shift2_mmx() {
		return &_oil_function_impl_lift_sub_shift2_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_synth_53_mmx() {
		return &_oil_function_impl_synth_53_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_add_s16_mmx() {
		return &_oil_function_impl_mas2_add_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_add_s16_lim_mmx() {
		return &_oil_function_impl_mas2_add_s16_lim_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_mmx() {
		return &_oil_function_impl_mas4_add_s16_mmx;
}
#endif
  2177 
#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_mmx() {
		return &_oil_function_impl_mas8_add_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd() {
		return &_oil_function_impl_mas4_add_s16_pmaddwd;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd_2() {
		return &_oil_function_impl_mas4_add_s16_pmaddwd_2;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd() {
		return &_oil_function_impl_mas8_add_s16_pmaddwd;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd2() {
		return &_oil_function_impl_mas8_add_s16_pmaddwd2;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_sse2() {
		return &_oil_function_impl_mas8_add_s16_sse2;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_across_add_s16_mmx() {
		return &_oil_function_impl_mas2_across_add_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_const_rshift_s16_mmx() {
		return &_oil_function_impl_add_const_rshift_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_mmx() {
		return &_oil_function_impl_multiply_and_add_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx() {
		return &_oil_function_impl_multiply_and_add_s16_u8_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx_2() {
		return &_oil_function_impl_multiply_and_add_s16_u8_mmx_2;
}
#endif
  2268 
  2269 #ifdef	__SYMBIAN32__
  2270  
  2271 OilFunctionImpl* __oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx() {
  2272 		return &_oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx;
  2273 }
  2274 #endif
  2275 
#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_across_add_s16_mmx() {
		return &_oil_function_impl_mas4_across_add_s16_mmx;
}
#endif
  2282 
#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_across_add_s16_mmx() {
		return &_oil_function_impl_mas8_across_add_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lshift_s16_mmx() {
		return &_oil_function_impl_lshift_s16_mmx;
}
#endif

#ifdef	__SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lshift_s16_mmx_2() {
		return &_oil_function_impl_lshift_s16_mmx_2;
}
#endif
  2310 
  2311 
  2312 
  2313 #ifdef	__SYMBIAN32__
  2314  
  2315 OilFunctionImpl* __oil_function_impl_split_53_nomix() {
  2316 		return &_oil_function_impl_split_53_nomix;
  2317 }
  2318 #endif
  2319 
  2320 #ifdef	__SYMBIAN32__
  2321  
  2322 OilFunctionImpl* __oil_function_impl_split_53_c() {
  2323 		return &_oil_function_impl_split_53_c;
  2324 }
  2325 #endif
  2326 
  2327 #ifdef	__SYMBIAN32__
  2328  
  2329 OilFunctionImpl* __oil_function_impl_synth_53_c() {
  2330 		return &_oil_function_impl_synth_53_c;
  2331 }
  2332 #endif
  2333 
  2334 #ifdef	__SYMBIAN32__
  2335  
  2336 OilFunctionImpl* __oil_function_impl_deinterleave2_c_1() {
  2337 		return &_oil_function_impl_deinterleave2_c_1;
  2338 }
  2339 #endif
  2340 
  2341 #ifdef	__SYMBIAN32__
  2342  
  2343 OilFunctionImpl* __oil_function_impl_deinterleave2_asm() {
  2344 		return &_oil_function_impl_deinterleave2_asm;
  2345 }
  2346 #endif
  2347