/*
 * os/ossrv/genericopenlibs/liboil/src/i386/wavelet.c
 * Imported from Mercurial changeset 1:260cb5ec6c19 ("Update contrib.",
 * author sl, Tue, 10 Jun 2014 14:32:02 +0200, permissions -rw-r--r--).
 */
//Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
sl@0
     2
/*
sl@0
     3
Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org>
sl@0
     4
  All rights reserved.
sl@0
     5
  
sl@0
     6
  Redistribution and use in source and binary forms, with or without
sl@0
     7
  modification, are permitted provided that the following conditions
sl@0
     8
  are met:
sl@0
     9
  1. Redistributions of source code must retain the above copyright
sl@0
    10
     notice, this list of conditions and the following disclaimer.
sl@0
    11
  2. Redistributions in binary form must reproduce the above copyright
sl@0
    12
     notice, this list of conditions and the following disclaimer in the
sl@0
    13
     documentation and/or other materials provided with the distribution.
sl@0
    14
  
sl@0
    15
  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
sl@0
    16
  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
sl@0
    17
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
sl@0
    18
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
sl@0
    19
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
sl@0
    20
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
sl@0
    21
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
sl@0
    22
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
sl@0
    23
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
sl@0
    24
  IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
sl@0
    25
  POSSIBILITY OF SUCH DAMAGE.
sl@0
    26
*/
sl@0
    27
sl@0
    28
#include <liboil/liboilfunction.h>
sl@0
    29
#include <liboil/liboilclasses.h>
sl@0
    30
sl@0
    31
sl@0
    32
/*
 * split_53_nomix: forward 5/3 (LeGall) wavelet lifting over 2*n
 * interleaved samples.  s_2xn holds the input; d_2xn receives the
 * result with high-pass (predict) values at odd indices and low-pass
 * (update) values at even indices.  The right edge predicts from the
 * left neighbour only.  No-op when n == 0.
 */
void
split_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int k;
  int len = n * 2;

  if (n == 0) return;

  /* predict: odd samples become the residual against the neighbour mean */
  for (k = 1; k + 2 < len; k += 2) {
    d_2xn[k] = s_2xn[k] - ((s_2xn[k - 1] + s_2xn[k + 1]) >> 1);
  }
  /* right edge: only a left neighbour exists */
  d_2xn[len - 1] = s_2xn[len - 1] - s_2xn[len - 2];

  /* update: even samples absorb part of the neighbouring details */
  d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
  for (k = 2; k < len; k += 2) {
    d_2xn[k] = s_2xn[k] + ((d_2xn[k - 1] + d_2xn[k + 1]) >> 2);
  }
}
sl@0
    50
OIL_DEFINE_IMPL (split_53_nomix, split_53);
sl@0
    51
  
sl@0
    52
#if 0
sl@0
    53
void
sl@0
    54
synth_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
sl@0
    55
{ 
sl@0
    56
  int i;
sl@0
    57
    
sl@0
    58
  /* predict */ 
sl@0
    59
  i_n[0] -= i_n[1] >> 1;
sl@0
    60
  for(i=2;i<n*2;i+=2){
sl@0
    61
    i_n[i] -= (i_n[i-1] + i_n[i+1]) >> 2;
sl@0
    62
  }
sl@0
    63
  
sl@0
    64
  /* update */
sl@0
    65
  for(i=1;i<n*2-2;i+=2){
sl@0
    66
    i_n[i] += (i_n[i+1] + i_n[i-1]) >> 1;
sl@0
    67
  }
sl@0
    68
  i_n[n*2-1] += i_n[n*2-2];
sl@0
    69
}
sl@0
    70
#endif
sl@0
    71
sl@0
    72
sl@0
    73
/*
 * split_53_c: forward 5/3 wavelet lifting over 2*n interleaved samples.
 * Odd outputs hold the predict (high-pass) residual, even outputs the
 * update (low-pass) term; both edges use one-sided neighbours.  The
 * original walked d/s pointers forward two at a time; this version uses
 * explicit indices with identical read/write order.
 */
void
split_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int k;
  int len = n * 2;

  if (n == 0) return;

  if (n == 1) {
    /* single pair: predict from the lone left neighbour */
    d_2xn[1] = s_2xn[1] - s_2xn[0];
    d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
    return;
  }

  /* left edge */
  d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
  d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);

  /* interior pairs: n-2 iterations, same order as the pointer loop */
  for (k = 2; k <= len - 4; k += 2) {
    d_2xn[k + 1] = s_2xn[k + 1] - ((s_2xn[k] + s_2xn[k + 2]) >> 1);
    d_2xn[k] = s_2xn[k] + ((d_2xn[k - 1] + d_2xn[k + 1]) >> 2);
  }

  /* right edge */
  d_2xn[len - 1] = s_2xn[len - 1] - s_2xn[len - 2];
  d_2xn[len - 2] = s_2xn[len - 2] + ((d_2xn[len - 3] + d_2xn[len - 1]) >> 2);
}
sl@0
    97
OIL_DEFINE_IMPL (split_53_c, split_53);
sl@0
    98
sl@0
    99
/*
 * synth_53_c: inverse 5/3 wavelet lifting over 2*n interleaved samples.
 * Undoes split_53: even inputs are low-pass, odd inputs are high-pass;
 * the output is the reconstructed signal.  No-op when n == 0.
 */
void
synth_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int k;
  int len;

  if (n == 0) return;

  if (n == 1) {
    /* single pair: invert the degenerate split */
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    d_2xn[1] = s_2xn[1] + d_2xn[0];
    return;
  }

  len = n * 2;

  /* inverse update at the left edge */
  d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);

  /* interior: inverse update on evens, then inverse predict on the odd
   * sample between the two evens already reconstructed */
  for (k = 2; k < len - 2; k += 2) {
    d_2xn[k] = s_2xn[k] - ((s_2xn[k - 1] + s_2xn[k + 1]) >> 2);
    d_2xn[k - 1] = s_2xn[k - 1] + ((d_2xn[k] + d_2xn[k - 2]) >> 1);
  }

  /* right edge */
  d_2xn[len - 2] = s_2xn[len - 2] - ((s_2xn[len - 3] + s_2xn[len - 1]) >> 2);
  d_2xn[len - 3] = s_2xn[len - 3] + ((d_2xn[len - 2] + d_2xn[len - 4]) >> 1);
  d_2xn[len - 1] = s_2xn[len - 1] + d_2xn[len - 2];
}
sl@0
   119
OIL_DEFINE_IMPL (synth_53_c, synth_53);
sl@0
   120
sl@0
   121
/*
 * deinterleave2_c_1: reference C implementation.  Splits the interleaved
 * stream s_2xn (2*n samples) so that even-index samples go to d1 and
 * odd-index samples go to d2, n samples each.
 */
void
deinterleave2_c_1 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  const int16_t *end = s_2xn + 2 * n;

  while (s_2xn < end) {
    *d1++ = *s_2xn++;   /* even sample */
    *d2++ = *s_2xn++;   /* odd sample */
  }
}
sl@0
   131
OIL_DEFINE_IMPL (deinterleave2_c_1, deinterleave2_s16);
sl@0
   132
sl@0
   133
/*
 * deinterleave2_asm: scalar i386 deinterleave.  Even samples of s_2xn
 * go to d1, odd samples to d2.  A C prologue peels one pair while n is
 * odd so the asm loop can process two pairs per iteration, walking ecx
 * downward from n-2 to 0.
 * NOTE(review): the asm stores through d1/d2 but declares no "memory"
 * clobber; it relies on `asm volatile` ordering -- confirm this is
 * still safe on current GCC before reuse.
 */
void
deinterleave2_asm (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  if (n == 0) return;

  /* peel until n is even */
  while (n&1) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }

  asm volatile ("\n"
      "  mov %3, %%ecx\n"              /* ecx = n (pairs), counted down by 2 */
      "  sub $2, %%ecx\n"
      "1:\n"
      "  movw (%1,%%ecx,4), %%ax\n"    /* even sample of pair ecx */
      "  movw %%ax, (%0,%%ecx,2)\n"
      "  movw 2(%1,%%ecx,4), %%ax\n"   /* odd sample of pair ecx */
      "  movw %%ax, (%2,%%ecx,2)\n"
      "  movw 4(%1,%%ecx,4), %%ax\n"   /* even sample of pair ecx+1 */
      "  movw %%ax, 2(%0,%%ecx,2)\n"
      "  movw 6(%1,%%ecx,4), %%ax\n"   /* odd sample of pair ecx+1 */
      "  movw %%ax, 2(%2,%%ecx,2)\n"
      "  sub $2, %%ecx\n"
      "  jge 1b\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   165
OIL_DEFINE_IMPL (deinterleave2_asm, deinterleave2_s16);
sl@0
   166
sl@0
   167
/*
 * deinterleave2_mmx: MMX deinterleave, 4 pairs (16 bytes of source) per
 * iteration.  Even lanes are isolated with pslld/psrad (sign-extending
 * the low word of each dword) and packed to d1; odd lanes with psrad
 * alone, packed to d2.  A C prologue peels pairs until n % 4 == 0.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
deinterleave2_mmx (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm0\n"
      "  movq 8(%1,%%ecx,4), %%mm1\n"
      "  pslld $16, %%mm0\n"           /* keep low (even) word of each dword */
      "  pslld $16, %%mm1\n"
      "  psrad $16, %%mm0\n"
      "  psrad $16, %%mm1\n"
      "  packssdw %%mm1, %%mm0\n"      /* 4 even samples -> d1 */
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  movq (%1,%%ecx,4), %%mm0\n"
      "  movq 8(%1,%%ecx,4), %%mm1\n"
      "  psrad $16, %%mm0\n"           /* keep high (odd) word of each dword */
      "  psrad $16, %%mm1\n"
      "  packssdw %%mm1, %%mm0\n"      /* 4 odd samples -> d2 */
      "  movq %%mm0, (%2,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   205
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
sl@0
   206
sl@0
   207
/*
 * deinterleave2_mmx_2: deinterleave using pshufw (MMXEXT, per the
 * OIL_IMPL_FLAG_MMXEXT registration below this function).  Each
 * iteration shuffles one quadword: $0xd8 gathers the two even words
 * into the low dword for d1, $0x8d gathers the two odd words for d2;
 * only 2 pairs are consumed per iteration (add $2).
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
deinterleave2_mmx_2 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  pshufw $0xd8, (%1,%%ecx,4), %%mm0\n"  /* even words -> low dword */
      "  movd %%mm0, (%0,%%ecx,2)\n"
      "  pshufw $0x8d, (%1,%%ecx,4), %%mm0\n"  /* odd words -> low dword */
      "  movd %%mm0, (%2,%%ecx,2)\n"
      "  add $2, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   235
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_2, deinterleave2_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
   236
sl@0
   237
/*
 * deinterleave2_mmx_3: deinterleave via a two-stage punpck word
 * transpose, 4 pairs per iteration.  The unpack sequence rearranges
 * 8 interleaved words so mm0 ends up with the 4 even samples and mm1
 * with the 4 odd samples.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
deinterleave2_mmx_3 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm1\n"   /* low 4 words */
      "  movq (%1,%%ecx,4), %%mm2\n"
      "  movq 8(%1,%%ecx,4), %%mm0\n"  /* high 4 words */
      "  punpcklwd %%mm0, %%mm1\n"     /* first transpose stage */
      "  punpckhwd %%mm0, %%mm2\n"
      "  movq %%mm1, %%mm0\n"
      "  punpcklwd %%mm2, %%mm0\n"     /* mm0 = 4 even samples */
      "  punpckhwd %%mm2, %%mm1\n"     /* mm1 = 4 odd samples */
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  movq %%mm1, (%2,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   271
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_3, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
sl@0
   272
sl@0
   273
/*
 * deinterleave2_mmx_4: unrolled variant of deinterleave2_mmx_3 handling
 * 8 pairs per iteration; the extra-indented instructions form the second
 * interleaved transpose (mm4-mm6), software-pipelined with the first.
 * The C prologue peels pairs until n % 8 == 0.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
deinterleave2_mmx_4 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  /* peel until n is a multiple of 8 */
  while (n&7) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,4), %%mm1\n"
      "  movq %%mm1, %%mm2\n"
      "  movq 8(%1,%%ecx,4), %%mm0\n"
      "   movq 16(%1,%%ecx,4), %%mm5\n"
      "  punpcklwd %%mm0, %%mm1\n"
      "   movq %%mm5, %%mm6\n"
      "  punpckhwd %%mm0, %%mm2\n"
      "   movq 24(%1,%%ecx,4), %%mm4\n"
      "  movq %%mm1, %%mm0\n"
      "   punpcklwd %%mm4, %%mm5\n"
      "  punpcklwd %%mm2, %%mm0\n"     /* mm0 = even samples 0-3 */
      "   punpckhwd %%mm4, %%mm6\n"
      "  punpckhwd %%mm2, %%mm1\n"     /* mm1 = odd samples 0-3 */
      "   movq %%mm5, %%mm4\n"
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "   punpcklwd %%mm6, %%mm4\n"    /* mm4 = even samples 4-7 */
      "  movq %%mm1, (%2,%%ecx,2)\n"
      "   punpckhwd %%mm6, %%mm5\n"    /* mm5 = odd samples 4-7 */
      "   movq %%mm4, 8(%0,%%ecx,2)\n"
      "   movq %%mm5, 8(%2,%%ecx,2)\n"
      "  add $8, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   317
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_4, deinterleave2_s16, OIL_IMPL_FLAG_MMX);
sl@0
   318
sl@0
   319
sl@0
   320
/*
 * lift_add_mult_shift12_i386_mmx:
 *   d[i] = s1[i] + ((s4[0] * (s2[i] + s3[i])) >> 12), 4 lanes at a time.
 * The scalar s4[0] is broadcast to all four mm7 lanes; the full 16-bit
 * product >> 12 is rebuilt from pmullw (low half, psrlw $12) and pmulhw
 * (high half, psllw $4) OR'd together.
 * NOTE(review): the initial `val = *s4` is immediately overwritten below
 * and appears redundant.  The asm also lacks a "memory" clobber; see
 * deinterleave2_asm.
 */
void
lift_add_mult_shift12_i386_mmx (int16_t *d, int16_t *s1, int16_t *s2,
    int16_t *s3, int16_t *s4, int n)
{
  uint32_t val = *s4;

  /* peel until n is a multiple of 4 */
  while (n&3) {
    d[0] = s1[0] + ((s4[0]*(s2[0] + s3[0]))>>12);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  /* duplicate the 16-bit multiplier into both halves of val */
  val = ((*(uint16_t *)s4)<<16) | (*(uint16_t *)s4);
  n>>=2;  /* iterations of 4 lanes each */
  asm volatile ("\n"
      "  mov %4, %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  punpcklwd %%mm7, %%mm7\n"     /* mm7 = multiplier in all 4 lanes */
      "  mov %5, %%ecx\n"              /* ecx = loop count */
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  paddsw 0(%3), %%mm0\n"        /* s2 + s3 (saturating) */
      "  movq %%mm0, %%mm1\n"
      "  pmullw %%mm7, %%mm0\n"        /* low 16 bits of product */
      "  pmulhw %%mm7, %%mm1\n"        /* high 16 bits of product */
      "  psrlw $12, %%mm0\n"
      "  psllw $4, %%mm1\n"
      "  por %%mm1, %%mm0\n"           /* (product) >> 12 */
      "  paddsw 0(%1), %%mm0\n"        /* + s1 */
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %%ecx\n"
      "  jne 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (val), "m" (n)
      : "ecx");
}
sl@0
   365
OIL_DEFINE_IMPL_FULL (lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12, OIL_IMPL_FLAG_MMX);
sl@0
   366
sl@0
   367
/*
 * interleave2_mmx: inverse of the deinterleave routines -- merge s1
 * (even positions) and s2 (odd positions) into d_2xn, 4 output pairs
 * per asm iteration via punpcklwd/punpckhwd.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
interleave2_mmx (int16_t *d_2xn, int16_t *s1, int16_t *s2, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d_2xn[0] = s1[0];
    d_2xn[1] = s2[0];
    s1++;
    s2++;
    d_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%1,%%ecx,2), %%mm0\n"   /* 4 even samples */
      "  movq (%2,%%ecx,2), %%mm1\n"   /* 4 odd samples */
      "  movq %%mm0, %%mm2\n"
      "  punpckhwd %%mm1, %%mm0\n"     /* upper 2 interleaved pairs */
      "  punpcklwd %%mm1, %%mm2\n"     /* lower 2 interleaved pairs */
      "  movq %%mm2, (%0,%%ecx,4)\n"
      "  movq %%mm0, 8(%0,%%ecx,4)\n"
      "  add $4, %%ecx\n"
      "  cmp %3, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d_2xn), "+r" (s1), "+r" (s2)
      : "m" (n)
      : "eax", "ecx");
}
sl@0
   398
OIL_DEFINE_IMPL_FULL (interleave2_mmx, interleave2_s16, OIL_IMPL_FLAG_MMX);
sl@0
   399
sl@0
   400
/*
 * lift_add_shift1_mmx: d[i] = s1[i] + ((s2[i] + s3[i]) >> 1),
 * 4 lanes per asm iteration (the 5/3 update step as an array op).
 * NOTE(review): asm uses paddw (wrapping) while overflow semantics of
 * the C reference rely on int arithmetic -- matches liboil convention
 * for these kernels.  No "memory" clobber; see deinterleave2_asm.
 */
void
lift_add_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  paddw %%mm2, %%mm1\n"         /* s2 + s3 */
      "  psraw $1, %%mm1\n"            /* >> 1 (arithmetic) */
      "  paddw (%1,%%ecx,2), %%mm1\n"  /* + s1 */
      "  movq %%mm1, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
sl@0
   430
OIL_DEFINE_IMPL_FULL (lift_add_shift1_mmx, lift_add_shift1, OIL_IMPL_FLAG_MMX);
sl@0
   431
sl@0
   432
/*
 * lift_sub_shift1_mmx: d[i] = s1[i] - ((s2[i] + s3[i]) >> 1),
 * 4 lanes per asm iteration (the 5/3 predict step as an array op).
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
lift_sub_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  movq (%1,%%ecx,2), %%mm0\n"
      "  paddw %%mm2, %%mm1\n"         /* s2 + s3 */
      "  psraw $1, %%mm1\n"            /* >> 1 (arithmetic) */
      "  psubw %%mm1, %%mm0\n"         /* s1 - ... */
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
sl@0
   463
OIL_DEFINE_IMPL_FULL (lift_sub_shift1_mmx, lift_sub_shift1, OIL_IMPL_FLAG_MMX);
sl@0
   464
sl@0
   465
/*
 * lift_add_shift2_mmx: d[i] = s1[i] + ((s2[i] + s3[i]) >> 2),
 * 4 lanes per asm iteration; identical to lift_add_shift1_mmx except
 * for the shift amount.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
lift_add_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  paddw %%mm2, %%mm1\n"         /* s2 + s3 */
      "  psraw $2, %%mm1\n"            /* >> 2 (arithmetic) */
      "  paddw (%1,%%ecx,2), %%mm1\n"  /* + s1 */
      "  movq %%mm1, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
sl@0
   495
OIL_DEFINE_IMPL_FULL (lift_add_shift2_mmx, lift_add_shift2, OIL_IMPL_FLAG_MMX);
sl@0
   496
sl@0
   497
/*
 * lift_sub_shift2_mmx: d[i] = s1[i] - ((s2[i] + s3[i]) >> 2),
 * 4 lanes per asm iteration; identical to lift_sub_shift1_mmx except
 * for the shift amount.
 * NOTE(review): no "memory" clobber on the asm; see deinterleave2_asm.
 */
void
lift_sub_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  /* peel until n is a multiple of 4 */
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      "  xor %%ecx, %%ecx\n"
      "1:\n"
      "  movq (%2,%%ecx,2), %%mm1\n"
      "  movq (%3,%%ecx,2), %%mm2\n"
      "  movq (%1,%%ecx,2), %%mm0\n"
      "  paddw %%mm2, %%mm1\n"         /* s2 + s3 */
      "  psraw $2, %%mm1\n"            /* >> 2 (arithmetic) */
      "  psubw %%mm1, %%mm0\n"         /* s1 - ... */
      "  movq %%mm0, (%0,%%ecx,2)\n"
      "  add $4, %%ecx\n"
      "  cmp %4, %%ecx\n"
      "  jl 1b\n"
      "  emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
sl@0
   528
OIL_DEFINE_IMPL_FULL (lift_sub_shift2_mmx, lift_sub_shift2, OIL_IMPL_FLAG_MMX);
sl@0
   529
sl@0
   530
#ifdef ENABLE_BROKEN_IMPLS
sl@0
   531
void
sl@0
   532
synth_53_mmx (int16_t *d_2xn, int16_t *s_2xn, int n)
sl@0
   533
{
sl@0
   534
  int i;
sl@0
   535
    
sl@0
   536
  if (n==0) return;
sl@0
   537
  if (n == 1) {
sl@0
   538
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
sl@0
   539
    d_2xn[1] = s_2xn[1] + d_2xn[0]; 
sl@0
   540
  } else {
sl@0
   541
    int i;
sl@0
   542
sl@0
   543
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
sl@0
   544
sl@0
   545
    if (n > 6) {
sl@0
   546
      n-=5;
sl@0
   547
sl@0
   548
      asm volatile ("\n"
sl@0
   549
          "  xor %%ecx, %%ecx\n"
sl@0
   550
          "  movw 2(%1), %%ecx\n"
sl@0
   551
          "  movd %%ecx, %%mm7\n"
sl@0
   552
          "  movw 0(%0), %%ecx\n"
sl@0
   553
          "  movd %%ecx, %%mm6\n"
sl@0
   554
          "  movw 0(%1), %%ecx\n"
sl@0
   555
          "  movd %%ecx, %%mm5\n"
sl@0
   556
sl@0
   557
          "  xor %%ecx, %%ecx\n"
sl@0
   558
          "1:\n"
sl@0
   559
          "  movq 4(%1,%%ecx,4), %%mm1\n"  // mm1 = s5 s4 s3 s2
sl@0
   560
          "  movq %%mm1, %%mm2\n"          // mm2 = s5 s4 s3 s2
sl@0
   561
          "  movq 12(%1,%%ecx,4), %%mm0\n" // mm0 = s9 s8 s7 s6
sl@0
   562
          "  punpcklwd %%mm0, %%mm1\n"     // mm1 = s7 s3 s6 s2
sl@0
   563
          "  punpckhwd %%mm0, %%mm2\n"     // mm2 = s9 s5 s8 s4
sl@0
   564
          "  movq %%mm1, %%mm0\n"          // mm0 = s7 s3 s6 s2
sl@0
   565
          "  punpcklwd %%mm2, %%mm0\n"     // mm0 = s8 s6 s4 s2
sl@0
   566
          "  punpckhwd %%mm2, %%mm1\n"     // mm1 = s9 s7 s5 s3
sl@0
   567
          //"  movq %%mm0, %%mm3\n"          // mm0 = s8 s6 s4 s2
sl@0
   568
sl@0
   569
          "  movq %%mm1, %%mm2\n"          // mm2 = s9 s7 s5 s3
sl@0
   570
          "  psllq $16, %%mm2\n"           // mm2 = s7 s5 s3 00
sl@0
   571
          "  por %%mm7, %%mm2\n"           // mm2 = s7 s5 s3 s1
sl@0
   572
          "  movq %%mm2, %%mm4\n"          // mm4 = s7 s5 s3 s1
sl@0
   573
          "  paddw %%mm1, %%mm2\n"         // mm2 = s9+s7 ...
sl@0
   574
          "  psraw $2, %%mm2\n"            // mm2 = (s9+s7)>>2 ...
sl@0
   575
          "  movq %%mm1, %%mm7\n"          // mm7 = s9 s7 s5 s3
sl@0
   576
          "  psrlq $48, %%mm7\n"           // mm7 = 00 00 00 s9
sl@0
   577
          "  psubw %%mm2, %%mm0\n"         // mm0 = d8 d6 d4 d2
sl@0
   578
sl@0
   579
          "  movq %%mm0, %%mm1\n"          // mm1 = d8 d6 d4 d2
sl@0
   580
          "  movq %%mm0, %%mm3\n"          // mm1 = d8 d6 d4 d2
sl@0
   581
          "  psllq $16, %%mm0\n"           // mm0 = d6 d4 d2 00
sl@0
   582
          "  por %%mm6, %%mm0\n"           // mm0 = d6 d4 d2 d0
sl@0
   583
          "  psrlq $48, %%mm1\n"           // mm1 = 00 00 00 d8
sl@0
   584
          "  movq %%mm1, %%mm6\n"          // mm6 = 00 00 00 d8
sl@0
   585
sl@0
   586
          "  movq %%mm0, %%mm1\n"
sl@0
   587
          "  paddw %%mm3, %%mm1\n"         // mm0 = d8+d6 ...
sl@0
   588
          "  psraw $1, %%mm1\n"            // mm1 = (d8+d6)>>1 ...
sl@0
   589
          "  paddw %%mm4, %%mm1\n"         // mm1 = d7 d5 d3 d1
sl@0
   590
sl@0
   591
          "  movq %%mm1, %%mm2\n"
sl@0
   592
sl@0
   593
          "  movq %%mm0, %%mm1\n"
sl@0
   594
          "  punpcklwd %%mm2, %%mm0\n"
sl@0
   595
          "  punpckhwd %%mm2, %%mm1\n"
sl@0
   596
sl@0
   597
          "  movq %%mm0, (%0, %%ecx, 4)\n"
sl@0
   598
          "  movq %%mm1, 8(%0, %%ecx, 4)\n"
sl@0
   599
sl@0
   600
          "  add $4, %%ecx\n"
sl@0
   601
          "  cmp %3, %%ecx\n"
sl@0
   602
          "  jl 1b\n"
sl@0
   603
          "  emms\n"
sl@0
   604
          : "+r" (d_2xn), "+r" (s_2xn), "+ecx" (i)
sl@0
   605
          : "m" (n));
sl@0
   606
sl@0
   607
      i*=2;
sl@0
   608
      n+=5;
sl@0
   609
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
sl@0
   610
      i+=2;
sl@0
   611
    } else {
sl@0
   612
      i = 2;
sl@0
   613
    }
sl@0
   614
    for(;i<n*2-2;i+=2){
sl@0
   615
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
sl@0
   616
      d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
sl@0
   617
    }
sl@0
   618
    d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
sl@0
   619
    d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
sl@0
   620
    d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
sl@0
   621
  } 
sl@0
   622
}
sl@0
   623
OIL_DEFINE_IMPL_FULL (synth_53_mmx, synth_53, OIL_IMPL_FLAG_MMX);
sl@0
   624
#endif
sl@0
   625
sl@0
   626
sl@0
   627
/*
 * mas2_add_s16_mmx: 2-tap multiply-accumulate with rounding offset:
 *   d1[i] = s1[i] + ((s4_2[0] + s2[i]*s3_2[0] + s2[i+1]*s3_2[1]) >> s4_2[1])
 * (per the scalar peel loop).  Uses pshufw, hence the MMXEXT flag in
 * the registration below.  32-bit intermediate products are built from
 * pmullw/pmulhw pairs, accumulated with paddd, shifted by psrad, then
 * packed back to 16 bits with saturation.
 * NOTE(review): `movq 2(%2)` is a deliberately misaligned load of
 * s2[i+1..i+4]; legal on x86.  The asm has no "memory" clobber -- see
 * deinterleave2_asm.
 */
void
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];

  /* peel until n is a multiple of 4 */
  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;  /* iterations of 4 outputs each */
  /* broadcast taps and offset: mm7 = tap0 x4, mm6 = tap1 x4, mm5 = offset x2 */
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pmullw %%mm7, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm7, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 2(%2), %%mm2\n"
      "  movq 2(%2), %%mm3\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1

      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  paddd %%mm5, %%mm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
      "  paddd %%mm5, %%mm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...

      "  movd %4, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      "  psrad %%mm4, %%mm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
}
sl@0
   704
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
   705
sl@0
   706
#if 0
sl@0
   707
void
sl@0
   708
mas2_add_s16_lim_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
   709
    int16_t *s4_2, int n)
sl@0
   710
{
sl@0
   711
  int shift = s4_2[1];
sl@0
   712
sl@0
   713
  while (n&3) {
sl@0
   714
    int x;
sl@0
   715
sl@0
   716
    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
sl@0
   717
    x >>= s4_2[1];
sl@0
   718
    d1[0] = s1[0] + x;
sl@0
   719
sl@0
   720
    d1++;
sl@0
   721
    s1++;
sl@0
   722
    s2++;
sl@0
   723
    n--;
sl@0
   724
  }
sl@0
   725
  if (n==0) return;
sl@0
   726
sl@0
   727
  n>>=2;
sl@0
   728
  asm volatile ("\n"
sl@0
   729
      "  movzwl 0(%0), %%ecx\n"
sl@0
   730
      "  movd %%ecx, %%mm7\n"
sl@0
   731
      "  pshufw $0x00, %%mm7, %%mm7\n"
sl@0
   732
      "  movzwl 2(%0), %%ecx\n"
sl@0
   733
      "  movd %%ecx, %%mm6\n"
sl@0
   734
      "  pshufw $0x00, %%mm6, %%mm6\n"
sl@0
   735
      "  movzwl 0(%1), %%ecx\n"
sl@0
   736
      "  movd %%ecx, %%mm5\n"
sl@0
   737
      "  pshufw $0x44, %%mm5, %%mm5\n"
sl@0
   738
      :: "r" (s3_2), "r" (s4_2)
sl@0
   739
      : "ecx"
sl@0
   740
      );
sl@0
   741
  asm volatile ("\n"
sl@0
   742
      "1:\n"
sl@0
   743
      "  movq 0(%2), %%mm0\n"
sl@0
   744
      "  paddq 2(%2), %%mm0\n"
sl@0
   745
sl@0
   746
      "  movd %4, %%mm4\n"
sl@0
   747
      "  psraw %%mm4, %%mm0\n"
sl@0
   748
sl@0
   749
      "  paddw 0(%1), %%mm0\n"
sl@0
   750
      "  movq %%mm0, 0(%0)\n"
sl@0
   751
      "  add $8, %0\n"
sl@0
   752
      "  add $8, %1\n"
sl@0
   753
      "  add $8, %2\n"
sl@0
   754
      "  decl %3\n"
sl@0
   755
      "  jnz 1b\n"
sl@0
   756
      "  emms\n"
sl@0
   757
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
   758
      : "r" (shift)
sl@0
   759
      );
sl@0
   760
}
sl@0
   761
OIL_DEFINE_IMPL_FULL (mas2_add_s16_lim_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
   762
#endif
sl@0
   763
sl@0
   764
/*
 * mas4_add_s16_mmx:
 * 4-tap multiply-accumulate-shift with final add, MMX/MMXEXT version.
 *
 * For each output sample (per the scalar reference loop below):
 *   d1[i] = s1[i] + ((s4_2[0] + sum_{j=0..3} s2[i+j]*s3_4[j]) >> s4_2[1])
 *
 * d1:   destination, n int16 samples
 * s1:   addend source, n int16 samples
 * s2:   filter input; n+3 elements are read (4-tap sliding window)
 * s3_4: the four 16-bit filter taps
 * s4_2: s4_2[0] = rounding offset, s4_2[1] = right-shift amount
 * n:    number of output samples
 *
 * A scalar prologue peels samples until n is a multiple of 4; the MMX
 * loop then produces 4 outputs per iteration with 32-bit intermediates
 * (pmullw/pmulhw pairs widened via punpck*).
 */
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_4,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];
  //int m;

  //m = n&3;
#if 1
  /* Scalar prologue: handle leading samples so the vector loop below
   * can consume exactly 4 outputs per pass. */
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
#endif
  if (n==0) return;

  n>>=2;
  /* Constant setup: mm7 = all four taps packed as 4x16, mm5 = rounding
   * offset replicated into both 32-bit lanes (pshufw $0x44). */
  asm volatile ("\n"
      "  movq 0(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_4), "r" (s4_2)
      : "ecx"
      );
  /* Main loop: for each tap j, multiply 4 input words by tap j,
   * widen lo/hi products to 32 bits, and accumulate into mm0 (low
   * two results) and mm1 (high two results). Unaligned movq at
   * offsets 2/4/6 fetches the shifted windows for taps 1..3. */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pshufw $0x00, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm6, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 2(%2), %%mm2\n"
      "  movq 2(%2), %%mm3\n"
      "  pshufw $0x55, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1
      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  movq 4(%2), %%mm2\n"
      "  movq 4(%2), %%mm3\n"
      "  pshufw $0xaa, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  movq %%mm4, %%mm3\n"
      "  paddd %%mm3, %%mm1\n"
      "  paddd %%mm2, %%mm0\n"

      "  movq 6(%2), %%mm2\n"
      "  movq 6(%2), %%mm3\n"
      "  pshufw $0xff, %%mm7, %%mm6\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  movq %%mm4, %%mm3\n"
      "  paddd %%mm3, %%mm1\n"
      "  paddd %%mm2, %%mm0\n"

      /* Add rounding offset, arithmetic-shift right, pack back to
       * 4x16 with signed saturation, then add s1[] and store. */
      "  paddd %%mm5, %%mm1\n"
      "  paddd %%mm5, %%mm0\n"

      "  movd %4, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"
      "  psrad %%mm4, %%mm0\n"

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
#if 0
  /* Dead scalar epilogue (m is never set; remainder is handled by the
   * prologue above instead). */
  while (m) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    m--;
  }
#endif
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
   888
sl@0
   889
#if 0
sl@0
   890
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
sl@0
   891
void
sl@0
   892
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
   893
    int16_t *s4_2, int n)
sl@0
   894
{
sl@0
   895
  while (n&3) {
sl@0
   896
    int x;
sl@0
   897
sl@0
   898
    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
sl@0
   899
    x >>= s4_2[1];
sl@0
   900
    d1[0] = s1[0] + x;
sl@0
   901
sl@0
   902
    d1++;
sl@0
   903
    s1++;
sl@0
   904
    s2++;
sl@0
   905
    n--;
sl@0
   906
  }
sl@0
   907
  if (n==0) return;
sl@0
   908
sl@0
   909
  n>>=2;
sl@0
   910
  asm volatile ("\n"
sl@0
   911
      "  movzwl 0(%0), %%ecx\n"
sl@0
   912
      "  movd %%ecx, %%mm7\n"
sl@0
   913
      "  pshufw $0x00, %%mm7, %%mm7\n"
sl@0
   914
      "  movzwl 2(%0), %%ecx\n"
sl@0
   915
      "  movd %%ecx, %%mm6\n"
sl@0
   916
      "  pshufw $0x00, %%mm6, %%mm6\n"
sl@0
   917
      "  movzwl 0(%1), %%ecx\n"
sl@0
   918
      "  movd %%ecx, %%mm5\n"
sl@0
   919
      "  pshufw $0x00, %%mm5, %%mm5\n"
sl@0
   920
      "  movzwl 2(%1), %%ecx\n"
sl@0
   921
      "  movd %%ecx, %%mm4\n"
sl@0
   922
      :: "r" (s3_2), "r" (s4_2)
sl@0
   923
      : "ecx"
sl@0
   924
      );
sl@0
   925
  asm volatile ("\n"
sl@0
   926
      "1:\n"
sl@0
   927
      "  movq 0(%2), %%mm0\n"
sl@0
   928
      "  pmullw %%mm7, %%mm0\n"
sl@0
   929
      "  movq 2(%2), %%mm1\n"
sl@0
   930
      "  pmullw %%mm6, %%mm1\n"
sl@0
   931
      "  paddw %%mm1, %%mm0\n"
sl@0
   932
      "  paddw %%mm5, %%mm0\n"
sl@0
   933
      "  psraw %%mm4, %%mm0\n"
sl@0
   934
      "  paddw 0(%1), %%mm0\n"
sl@0
   935
      "  movq %%mm0, 0(%0)\n"
sl@0
   936
      "  add $8, %0\n"
sl@0
   937
      "  add $8, %1\n"
sl@0
   938
      "  add $8, %2\n"
sl@0
   939
      "  decl %3\n"
sl@0
   940
      "  jnz 1b\n"
sl@0
   941
      "  emms\n"
sl@0
   942
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
   943
      );
sl@0
   944
}
sl@0
   945
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
   946
#endif
sl@0
   947
sl@0
   948
sl@0
   949
#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
/*
 * mas4_add_s16_mmx (disabled 16-bit variant):
 * 4-tap multiply-accumulate with 16-bit intermediates; kept under #if 0
 * because the narrow arithmetic can overflow versus the reference.
 *
 * Two defects fixed while dead (so the code is correct if re-enabled):
 *  - the scalar fallback summed s2[2]*s3_2[2] twice instead of including
 *    the fourth tap term s2[3]*s3_2[3];
 *  - the asm setup loaded taps 2 and 3 from offset 2(%0) (tap 1's slot)
 *    instead of 4(%0) and 6(%0).
 */
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  /* Scalar prologue until n is a multiple of 4. */
  while (n&3) {
    int x;

    /* FIX: fourth term was s2[2]*s3_2[2] (duplicate of the third). */
    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1] + 
      s2[2]*s3_2[2] + s2[3]*s3_2[3];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  /* mm7..mm4 = taps 0..3 broadcast to 4x16; mm3 = offset broadcast;
   * mm2 = shift count. */
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 4(%0), %%ecx\n"            /* FIX: was 2(%0) */
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x00, %%mm5, %%mm5\n"
      "  movzwl 6(%0), %%ecx\n"            /* FIX: was 2(%0) */
      "  movd %%ecx, %%mm4\n"
      "  pshufw $0x00, %%mm4, %%mm4\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  pshufw $0x00, %%mm3, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm2\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  /* 4 outputs per iteration; all products and sums stay 16-bit. */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw %%mm7, %%mm0\n"
      "  movq 2(%2), %%mm1\n"
      "  pmullw %%mm6, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  movq 4(%2), %%mm1\n"
      "  pmullw %%mm5, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  movq 6(%2), %%mm1\n"
      "  pmullw %%mm4, %%mm1\n"
      "  paddw %%mm1, %%mm0\n"
      "  paddw %%mm3, %%mm0\n"
      "  psraw %%mm2, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif
sl@0
  1020
sl@0
  1021
sl@0
  1022
#if 0
sl@0
  1023
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
sl@0
  1024
void
sl@0
  1025
mas8_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
  1026
    int16_t *s4_2, int n)
sl@0
  1027
{
sl@0
  1028
  while (n&3) {
sl@0
  1029
    int x;
sl@0
  1030
    int i;
sl@0
  1031
sl@0
  1032
    x = s4_2[0];
sl@0
  1033
    for(i=0;i<8;i++){
sl@0
  1034
      x += s2[i]*s3_2[i];
sl@0
  1035
    }
sl@0
  1036
    x >>= s4_2[1];
sl@0
  1037
    d1[0] = s1[0] + x;
sl@0
  1038
sl@0
  1039
    d1++;
sl@0
  1040
    s1++;
sl@0
  1041
    s2++;
sl@0
  1042
    n--;
sl@0
  1043
  }
sl@0
  1044
  if (n==0) return;
sl@0
  1045
sl@0
  1046
  n>>=2;
sl@0
  1047
  asm volatile ("\n"
sl@0
  1048
      "  movq 0(%0), %%mm6\n"
sl@0
  1049
      "  movq 8(%0), %%mm7\n"
sl@0
  1050
      "  movzwl 0(%1), %%ecx\n"
sl@0
  1051
      "  movd %%ecx, %%mm3\n"
sl@0
  1052
      "  pshufw $0x00, %%mm3, %%mm3\n"
sl@0
  1053
      "  pxor %%mm4, %%mm4\n"
sl@0
  1054
      "  movzwl 2(%1), %%ecx\n"
sl@0
  1055
      "  movd %%ecx, %%mm4\n"
sl@0
  1056
      :: "r" (s3_2), "r" (s4_2)
sl@0
  1057
      : "ecx"
sl@0
  1058
      );
sl@0
  1059
  asm volatile ("\n"
sl@0
  1060
      "1:\n"
sl@0
  1061
      "  pshufw $0x00, %%mm6, %%mm1\n"
sl@0
  1062
      "  movq 0(%2), %%mm0\n"
sl@0
  1063
      "  pmullw %%mm1, %%mm0\n"
sl@0
  1064
      "  pshufw $0x55, %%mm6, %%mm2\n"
sl@0
  1065
      "  movq 2(%2), %%mm1\n"
sl@0
  1066
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1067
      "  paddw %%mm1, %%mm0\n"
sl@0
  1068
      "  pshufw $0xaa, %%mm6, %%mm2\n"
sl@0
  1069
      "  movq 4(%2), %%mm1\n"
sl@0
  1070
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1071
      "  paddw %%mm1, %%mm0\n"
sl@0
  1072
      "  pshufw $0xff, %%mm6, %%mm2\n"
sl@0
  1073
      "  movq 6(%2), %%mm1\n"
sl@0
  1074
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1075
      "  paddw %%mm1, %%mm0\n"
sl@0
  1076
sl@0
  1077
      "  pshufw $0x00, %%mm7, %%mm2\n"
sl@0
  1078
      "  movq 8(%2), %%mm1\n"
sl@0
  1079
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1080
      "  paddw %%mm1, %%mm0\n"
sl@0
  1081
      "  pshufw $0x55, %%mm7, %%mm2\n"
sl@0
  1082
      "  movq 10(%2), %%mm1\n"
sl@0
  1083
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1084
      "  paddw %%mm1, %%mm0\n"
sl@0
  1085
      "  pshufw $0xaa, %%mm7, %%mm2\n"
sl@0
  1086
      "  movq 12(%2), %%mm1\n"
sl@0
  1087
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1088
      "  paddw %%mm1, %%mm0\n"
sl@0
  1089
      "  pshufw $0xff, %%mm7, %%mm2\n"
sl@0
  1090
      "  movq 14(%2), %%mm1\n"
sl@0
  1091
      "  pmullw %%mm2, %%mm1\n"
sl@0
  1092
      "  paddw %%mm1, %%mm0\n"
sl@0
  1093
sl@0
  1094
      "  paddw %%mm3, %%mm0\n"
sl@0
  1095
      "  psraw %%mm4, %%mm0\n"
sl@0
  1096
      "  paddw 0(%1), %%mm0\n"
sl@0
  1097
      "  movq %%mm0, 0(%0)\n"
sl@0
  1098
      "  add $8, %0\n"
sl@0
  1099
      "  add $8, %1\n"
sl@0
  1100
      "  add $8, %2\n"
sl@0
  1101
      "  decl %3\n"
sl@0
  1102
      "  jnz 1b\n"
sl@0
  1103
      "  emms\n"
sl@0
  1104
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
  1105
      );
sl@0
  1106
}
sl@0
  1107
OIL_DEFINE_IMPL_FULL (mas8_add_s16_mmx, mas8_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
  1108
#endif
sl@0
  1109
sl@0
  1110
sl@0
  1111
/*
 * mas4_add_s16_pmaddwd:
 * 4-tap multiply-accumulate using pmaddwd (32-bit intermediates),
 * producing one output sample per loop iteration.
 *
 * Per iteration: mm0 = pairwise dot products of 4 input words with the
 * 4 taps, the two 32-bit halves are summed (pshufw $0xee folds the high
 * qword half down), the offset is added, the result shifted right and
 * added to s1[i].
 *
 * mm6 = 4 taps, mm3 = offset, mm4 = shift count.
 */
void
mas4_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  /* One output per iteration; pointers advance by one int16 (2 bytes). */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
      "  paddd %%mm1, %%mm0\n"
      "  paddd %%mm3, %%mm0\n"
      "  psrad %%mm4, %%mm0\n"
      "  movd %%mm0, %%eax\n"
      "  addw 0(%1), %%ax\n"
      "  movw %%ax, 0(%0)\n"
      "  add $2, %0\n"
      "  add $2, %1\n"
      "  add $2, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
  1148
sl@0
  1149
void
sl@0
  1150
mas4_add_s16_pmaddwd_2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
  1151
    int16_t *s4_2, int n)
sl@0
  1152
{
sl@0
  1153
  if (n==0) return;
sl@0
  1154
  asm volatile ("\n"
sl@0
  1155
      "  movq 0(%0), %%mm6\n"
sl@0
  1156
      "  movzwl 0(%1), %%ecx\n"
sl@0
  1157
      "  movd %%ecx, %%mm3\n"
sl@0
  1158
      "  pshufw $0x44, %%mm3, %%mm3\n" // 01 00 01 00
sl@0
  1159
      "  movzwl 2(%1), %%ecx\n"
sl@0
  1160
      "  movd %%ecx, %%mm4\n"
sl@0
  1161
      :: "r" (s3_2), "r" (s4_2)
sl@0
  1162
      : "ecx"
sl@0
  1163
      );
sl@0
  1164
  if (n&1) {
sl@0
  1165
    asm volatile ("\n"
sl@0
  1166
        "  movq 0(%2), %%mm0\n"
sl@0
  1167
        "  pmaddwd %%mm6, %%mm0\n"
sl@0
  1168
        "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
sl@0
  1169
        "  paddd %%mm1, %%mm0\n"
sl@0
  1170
        "  paddd %%mm3, %%mm0\n"
sl@0
  1171
        "  psrad %%mm4, %%mm0\n"
sl@0
  1172
        "  movd %%mm0, %%eax\n"
sl@0
  1173
        "  addw 0(%1), %%ax\n"
sl@0
  1174
        "  movw %%ax, 0(%0)\n"
sl@0
  1175
        "  add $2, %0\n"
sl@0
  1176
        "  add $2, %1\n"
sl@0
  1177
        "  add $2, %2\n"
sl@0
  1178
        "  decl %3\n"
sl@0
  1179
        : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
  1180
        :
sl@0
  1181
        : "eax"
sl@0
  1182
        );
sl@0
  1183
  }
sl@0
  1184
  n>>=1;
sl@0
  1185
  asm volatile ("\n"
sl@0
  1186
      "1:\n"
sl@0
  1187
      "  movq 0(%2), %%mm0\n"
sl@0
  1188
      "  pmaddwd %%mm6, %%mm0\n"
sl@0
  1189
      "  movq 2(%2), %%mm2\n"
sl@0
  1190
      "  pmaddwd %%mm6, %%mm2\n"
sl@0
  1191
sl@0
  1192
      "  movq %%mm0, %%mm1\n"
sl@0
  1193
      "  punpckhdq %%mm2, %%mm0\n"
sl@0
  1194
      "  punpckldq %%mm2, %%mm1\n"
sl@0
  1195
sl@0
  1196
      "  paddd %%mm1, %%mm0\n"
sl@0
  1197
      "  paddd %%mm3, %%mm0\n"
sl@0
  1198
      "  psrad %%mm4, %%mm0\n"
sl@0
  1199
      "  pshufw $0xd8, %%mm0, %%mm0\n" // 11 01 10 00
sl@0
  1200
sl@0
  1201
      "  paddw 0(%1), %%mm0\n"
sl@0
  1202
      "  movd %%mm0, 0(%0)\n"
sl@0
  1203
      "  add $4, %0\n"
sl@0
  1204
      "  add $4, %1\n"
sl@0
  1205
      "  add $4, %2\n"
sl@0
  1206
      "  decl %3\n"
sl@0
  1207
      "  jnz 1b\n"
sl@0
  1208
      "  emms\n"
sl@0
  1209
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
  1210
      :
sl@0
  1211
      : "eax"
sl@0
  1212
      );
sl@0
  1213
}
sl@0
  1214
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd_2, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
  1215
sl@0
  1216
/*
 * mas8_add_s16_pmaddwd:
 * 8-tap multiply-accumulate using two pmaddwd instructions (32-bit
 * intermediates), one output sample per loop iteration.
 *
 * mm6/mm7 = the 8 taps (4 each); mm3 = rounding offset; mm4 = shift
 * count. Per iteration the two 4-tap partial dot products are summed,
 * the high qword half is folded down (pshufw $0xee), offset added,
 * shifted, then s1[i] added and the word stored.
 */
void
mas8_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      "  movq 0(%0), %%mm6\n"
      "  movq 8(%0), %%mm7\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm3\n"
      "  movzwl 2(%1), %%ecx\n"
      "  movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  /* One output per iteration; pointers advance by 2 bytes. */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmaddwd %%mm6, %%mm0\n"
      "  movq 8(%2), %%mm1\n"
      "  pmaddwd %%mm7, %%mm1\n"
      "  paddd %%mm1, %%mm0\n"
      "  pshufw $0xee, %%mm0, %%mm1\n"
      "  paddd %%mm1, %%mm0\n"
      "  paddd %%mm3, %%mm0\n"
      "  psrad %%mm4, %%mm0\n"
      "  movd %%mm0, %%eax\n"
      "  addw 0(%1), %%ax\n"
      "  movw %%ax, 0(%0)\n"
      "  add $2, %0\n"
      "  add $2, %1\n"
      "  add $2, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd, mas8_add_s16, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
sl@0
  1257
sl@0
  1258
sl@0
  1259
sl@0
  1260
#if 0
sl@0
  1261
void
sl@0
  1262
mas8_add_s16_pmaddwd2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
  1263
    int16_t *s4_2, int n)
sl@0
  1264
{
sl@0
  1265
  while (n&3) {
sl@0
  1266
    int x;
sl@0
  1267
    int i;
sl@0
  1268
sl@0
  1269
    x = s4_2[0];
sl@0
  1270
    for(i=0;i<8;i++){
sl@0
  1271
      x += s2[i]*s3_2[i];
sl@0
  1272
    }
sl@0
  1273
    x >>= s4_2[1];
sl@0
  1274
    d1[0] = s1[0] + x;
sl@0
  1275
sl@0
  1276
    d1++;
sl@0
  1277
    s1++;
sl@0
  1278
    s2++;
sl@0
  1279
    n--;
sl@0
  1280
  }
sl@0
  1281
  if (n==0) return;
sl@0
  1282
sl@0
  1283
  n>>=2;
sl@0
  1284
  asm volatile ("\n"
sl@0
  1285
      "  movq 0(%0), %%mm6\n"
sl@0
  1286
      "  movq 8(%0), %%mm7\n"
sl@0
  1287
      "  movzwl 0(%1), %%ecx\n"
sl@0
  1288
      "  movd %%ecx, %%mm5\n"
sl@0
  1289
      "  pshufw $0x00, %%mm5, %%mm5\n"
sl@0
  1290
      "  pxor %%mm4, %%mm4\n"
sl@0
  1291
      "  movzwl 2(%1), %%ecx\n"
sl@0
  1292
      "  movd %%ecx, %%mm4\n"
sl@0
  1293
      :: "r" (s3_2), "r" (s4_2)
sl@0
  1294
      : "ecx"
sl@0
  1295
      );
sl@0
  1296
  asm volatile ("\n"
sl@0
  1297
      "1:\n"
sl@0
  1298
      "  movq 0(%2), %%mm0\n"
sl@0
  1299
      "  pmaddwd %%mm6, %%mm0\n"
sl@0
  1300
      "  movq 8(%2), %%mm1\n"
sl@0
  1301
      "  pmaddwd %%mm7, %%mm1\n"
sl@0
  1302
      "  paddd %%mm1, %%mm0\n"
sl@0
  1303
      "  pshufw $0xee, %%mm0, %%mm1\n"
sl@0
  1304
      "  paddw %%mm1, %%mm0\n"
sl@0
  1305
sl@0
  1306
      "  movq 2(%2), %%mm2\n"
sl@0
  1307
      "  pmaddwd %%mm6, %%mm2\n"
sl@0
  1308
      "  movq 10(%2), %%mm3\n"
sl@0
  1309
      "  pmaddwd %%mm7, %%mm3\n"
sl@0
  1310
      "  paddd %%mm3, %%mm2\n"
sl@0
  1311
      "  pshufw $0xee, %%mm2, %%mm3\n"
sl@0
  1312
      "  paddw %%mm3, %%mm2\n"
sl@0
  1313
      "  pextrw $0, %%mm2, %%eax\n"
sl@0
  1314
      "  pinsrw $1, %%eax, %%mm0\n"
sl@0
  1315
sl@0
  1316
      "  movq 4(%2), %%mm2\n"
sl@0
  1317
      "  pmaddwd %%mm6, %%mm2\n"
sl@0
  1318
      "  movq 12(%2), %%mm3\n"
sl@0
  1319
      "  pmaddwd %%mm7, %%mm3\n"
sl@0
  1320
      "  paddd %%mm3, %%mm2\n"
sl@0
  1321
      "  pshufw $0xee, %%mm2, %%mm3\n"
sl@0
  1322
      "  paddw %%mm3, %%mm2\n"
sl@0
  1323
      "  pextrw $0, %%mm2, %%eax\n"
sl@0
  1324
      "  pinsrw $2, %%eax, %%mm0\n"
sl@0
  1325
sl@0
  1326
      "  movq 6(%2), %%mm2\n"
sl@0
  1327
      "  pmaddwd %%mm6, %%mm2\n"
sl@0
  1328
      "  movq 14(%2), %%mm3\n"
sl@0
  1329
      "  pmaddwd %%mm7, %%mm3\n"
sl@0
  1330
      "  paddd %%mm3, %%mm2\n"
sl@0
  1331
      "  pshufw $0xee, %%mm2, %%mm3\n"
sl@0
  1332
      "  paddw %%mm3, %%mm2\n"
sl@0
  1333
      "  pextrw $0, %%mm2, %%eax\n"
sl@0
  1334
      "  pinsrw $3, %%eax, %%mm0\n"
sl@0
  1335
sl@0
  1336
      "  paddw %%mm5, %%mm0\n"
sl@0
  1337
      "  psraw %%mm4, %%mm0\n"
sl@0
  1338
      "  paddw 0(%1), %%mm0\n"
sl@0
  1339
      "  movq %%mm0, 0(%0)\n"
sl@0
  1340
      "  add $8, %0\n"
sl@0
  1341
      "  add $8, %1\n"
sl@0
  1342
      "  add $8, %2\n"
sl@0
  1343
      "  decl %3\n"
sl@0
  1344
      "  jnz 1b\n"
sl@0
  1345
      "  emms\n"
sl@0
  1346
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
  1347
      :
sl@0
  1348
      : "eax"
sl@0
  1349
      );
sl@0
  1350
}
sl@0
  1351
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
sl@0
  1352
#endif
sl@0
  1353
sl@0
  1354
#if 0
sl@0
  1355
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
sl@0
  1356
void
sl@0
  1357
mas8_add_s16_sse2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
sl@0
  1358
    int16_t *s4_2, int n)
sl@0
  1359
{
sl@0
  1360
  asm volatile ("\n"
sl@0
  1361
      "  movq 0(%0), %%mm6\n"
sl@0
  1362
      "  movq 8(%0), %%mm7\n"
sl@0
  1363
      "  movzwl 0(%1), %%ecx\n"
sl@0
  1364
      "  movd %%ecx, %%mm3\n"
sl@0
  1365
      "  pshufw $0x00, %%mm3, %%mm3\n"
sl@0
  1366
      "  pxor %%mm4, %%mm4\n"
sl@0
  1367
      "  movzwl 2(%1), %%ecx\n"
sl@0
  1368
      "  movd %%ecx, %%mm4\n"
sl@0
  1369
      :: "r" (s3_2), "r" (s4_2)
sl@0
  1370
      : "ecx"
sl@0
  1371
      );
sl@0
  1372
  asm volatile ("\n"
sl@0
  1373
      "1:\n"
sl@0
  1374
      "  movq 0(%2), %%mm0\n"
sl@0
  1375
      "  pmullw %%mm6, %%mm0\n"
sl@0
  1376
      "  movq 8(%2), %%mm1\n"
sl@0
  1377
      "  pmullw %%mm7, %%mm1\n"
sl@0
  1378
      "  paddw %%mm1, %%mm0\n"
sl@0
  1379
      "  pshufw $0xee, %%mm0, %%mm1\n"
sl@0
  1380
      "  paddw %%mm1, %%mm0\n"
sl@0
  1381
      "  pshufw $0x01, %%mm0, %%mm1\n"
sl@0
  1382
      "  paddw %%mm1, %%mm0\n"
sl@0
  1383
      "  paddw %%mm3, %%mm0\n"
sl@0
  1384
      "  psraw %%mm4, %%mm0\n"
sl@0
  1385
      "  movd %%mm0, %%eax\n"
sl@0
  1386
      "  addw 0(%1), %%ax\n"
sl@0
  1387
      "  movw %%ax, 0(%0)\n"
sl@0
  1388
      "  add $2, %0\n"
sl@0
  1389
      "  add $2, %1\n"
sl@0
  1390
      "  add $2, %2\n"
sl@0
  1391
      "  decl %3\n"
sl@0
  1392
      "  jnz 1b\n"
sl@0
  1393
      "  emms\n"
sl@0
  1394
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
sl@0
  1395
      :
sl@0
  1396
      : "eax"
sl@0
  1397
      );
sl@0
  1398
}
sl@0
  1399
OIL_DEFINE_IMPL_FULL (mas8_add_s16_sse2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
sl@0
  1400
#endif
sl@0
  1401
sl@0
  1402
/*
 * mas2_across_add_s16_mmx:
 * 2-tap "across" multiply-accumulate: taps are applied to two separate
 * rows (s2 and s3) rather than to a sliding window of one array.
 *
 * Per the scalar reference loop:
 *   d1[i] = s1[i] + ((s5_2[0] + s2[i]*s4_2[0] + s3[i]*s4_2[1]) >> s5_2[1])
 *
 * s4_2: the two taps; s5_2: [offset, shift].
 * Scalar prologue peels samples until n is a multiple of 4; the MMX
 * loop then does 4 outputs per iteration with 32-bit intermediates.
 */
void
mas2_across_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
    int16_t *s4_2, int16_t *s5_2, int n)
{
  int shift = s5_2[1];

  while (n&3) {
    int x;

    x = s5_2[0] + s2[0]*s4_2[0] + s3[0]*s4_2[1];
    x >>= s5_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  if (n==0) return;  /* redundant after the check above, but harmless */
  /* mm7/mm6 = taps 0/1 broadcast to 4x16; mm5 = offset replicated into
   * both 32-bit lanes. */
  asm volatile ("\n"
      "  movzwl 0(%0), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%0), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "  pshufw $0x00, %%mm6, %%mm6\n"
      "  movzwl 0(%1), %%ecx\n"
      "  movd %%ecx, %%mm5\n"
      "  pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s4_2), "r" (s5_2)
      : "ecx"
      );
  /* 4 outputs per iteration; pmullw/pmulhw pairs are widened to 32-bit
   * products via punpck*, accumulated, rounded, shifted and packed. */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"       // mm0 = s0, s1, s2, s3
      "  movq 0(%2), %%mm1\n"       // mm1 = s0, s1, s2, s3
      "  pmullw %%mm7, %%mm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
      "  pmulhw %%mm7, %%mm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
      "  movq %%mm0, %%mm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
      "  punpcklwd %%mm1, %%mm0\n"  // mm0 = s0*a0, s1*a0
      "  punpckhwd %%mm1, %%mm2\n"  // mm2 = s2*a0, s3*a0
      "  movq %%mm2, %%mm1\n"       // mm1 = s2*a0, s3*a0

      "  movq 0(%3), %%mm2\n"
      "  movq 0(%3), %%mm3\n"
      "  pmullw %%mm6, %%mm2\n"
      "  pmulhw %%mm6, %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"  // mm2 = s1*a1, s2*a1
      "  punpckhwd %%mm3, %%mm4\n"  // mm4 = s3*a1, s4*a1
      "  movq %%mm4, %%mm3\n"       // mm3 = s3*a1, s4*a1

      "  paddd %%mm3, %%mm1\n"      // mm1 = s2*a0 + s3*a1, ...
      "  paddd %%mm2, %%mm0\n"      // mm0 = s0*a0 + s1*a1, ...

      "  paddd %%mm5, %%mm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
      "  paddd %%mm5, %%mm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...

      "  movd %5, %%mm4\n"
      "  psrad %%mm4, %%mm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      "  psrad %%mm4, %%mm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      "  packssdw %%mm1, %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+m" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_across_add_s16_mmx, mas2_across_add_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
  1484
sl@0
  1485
/*
 * add_const_rshift_s16_mmx:
 * d1[i] = (s1[i] + s2_2[0]) >> s2_2[1], MMX version, 4 samples per
 * vector iteration with a scalar prologue for the leading remainder.
 *
 * NOTE(review): the scalar prologue uses a plain (wrapping) add while
 * the vector loop uses paddsw (saturating), so results can differ near
 * INT16 overflow — presumably inputs are assumed in-range; confirm.
 */
void
add_const_rshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2_2, int n)
{
  while(n&3) {
    d1[0] = (s1[0] + s2_2[0])>>s2_2[1];
    d1++;
    s1++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  /* mm7 = constant broadcast to 4x16, mm6 = shift count. */
  asm volatile ("\n"
      "  movzwl 0(%2), %%ecx\n"
      "  movd %%ecx, %%mm7\n"
      "  pshufw $0x00, %%mm7, %%mm7\n"
      "  movzwl 2(%2), %%ecx\n"
      "  movd %%ecx, %%mm6\n"
      "1:\n"
      "  movq 0(%1), %%mm0\n"
      "  paddsw %%mm7, %%mm0\n"
      "  psraw %%mm6, %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2_2), "+r" (n)
      :
      : "ecx"
      );

}
OIL_DEFINE_IMPL_FULL (add_const_rshift_s16_mmx, add_const_rshift_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
sl@0
  1520
sl@0
  1521
/*
 * multiply_and_add_s16_mmx:
 * d1[i] = s1[i] + s2[i]*s3[i] (16-bit wrapping products), 4 samples per
 * vector iteration with a scalar prologue for the leading remainder.
 */
void
multiply_and_add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while(n&3) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  /* 4 outputs per iteration: pmullw then paddw straight from memory. */
  asm volatile ("\n"
      "1:\n"
      "  movq 0(%2), %%mm0\n"
      "  pmullw 0(%3), %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );

}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_mmx, multiply_and_add_s16,
    OIL_IMPL_FLAG_MMX);
sl@0
  1553
sl@0
  1554
/*
 * multiply_and_add_s16_u8_mmx:
 * d1[i] = s1[i] + s2[i]*s3[i] where s3 is uint8; 4 samples per vector
 * iteration. The bytes are zero-extended to words with punpcklbw
 * against a zeroed mm7 before the multiply.
 */
void
multiply_and_add_s16_u8_mmx(int16_t *d1, int16_t *s1, int16_t *s2,
    uint8_t *s3, int n)
{
  while(n&3) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%3), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "  pmullw 0(%2), %%mm0\n"
      "  paddw 0(%1), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "  add $8, %0\n"
      "  add $8, %1\n"
      "  add $8, %2\n"
      "  add $4, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );

}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8,
    OIL_IMPL_FLAG_MMX);
sl@0
  1589
sl@0
  1590
/*
 * multiply_and_add_s16_u8_mmx_2:
 * Same operation as multiply_and_add_s16_u8_mmx but 8 samples per
 * iteration, with the two 4-sample streams software-pipelined (the
 * extra-indented instructions belong to the second stream).
 */
void
multiply_and_add_s16_u8_mmx_2(int16_t *d1, int16_t *s1, int16_t *s2,
    uint8_t *s3, int n)
{
  while(n&7) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=3;
  if (n==0) return;
  asm volatile ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%3), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "   movd 4(%3), %%mm1\n"
      "  pmullw 0(%2), %%mm0\n"
      "   punpcklbw %%mm7, %%mm1\n"
      "  paddw 0(%1), %%mm0\n"
      "   pmullw 8(%2), %%mm1\n"
      "  movq %%mm0, 0(%0)\n"
      "   paddw 8(%1), %%mm1\n"
      "   movq %%mm1, 8(%0)\n"

      "  add $16, %0\n"
      "  add $16, %1\n"
      "  add $16, %2\n"
      "  add $8, %3\n"
      "  decl %4\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );

}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8,
    OIL_IMPL_FLAG_MMX);
sl@0
  1631
sl@0
  1632
/*
 * multiply_and_acc_12xn_s16_u8_mmx:
 * For each of n rows, multiply-accumulate a 12-element row:
 * i1[row][j] += s1[row][j] * s2[row][j] for j = 0..11, where s2 holds
 * uint8 values zero-extended to words. is1/ss1/ss2 are the per-row
 * byte strides of i1/s1/s2 (added with 32-bit addl — i386-only code).
 * The 12 elements are processed as three 4-word groups (offsets
 * 0/8/16 in the int16 arrays, 0/4/8 in the byte array).
 */
void
multiply_and_acc_12xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
    int ss1, uint8_t *s2, int ss2, int n)
{
  if (n==0) return;
  __asm__ __volatile__ ("\n"
      "  pxor %%mm7, %%mm7\n"
      "1:\n"
      "  movd 0(%2), %%mm0\n"
      "  punpcklbw %%mm7, %%mm0\n"
      "  pmullw 0(%1), %%mm0\n"
      "  paddw 0(%0), %%mm0\n"
      "  movq %%mm0, 0(%0)\n"
      "   movd 4(%2), %%mm1\n"
      "   punpcklbw %%mm7, %%mm1\n"
      "   pmullw 8(%1), %%mm1\n"
      "   paddw 8(%0), %%mm1\n"
      "   movq %%mm1, 8(%0)\n"
      "    movd 8(%2), %%mm2\n"
      "    punpcklbw %%mm7, %%mm2\n"
      "    pmullw 16(%1), %%mm2\n"
      "    paddw 16(%0), %%mm2\n"
      "    movq %%mm2, 16(%0)\n"

      "  addl %4, %0\n"
      "  addl %5, %1\n"
      "  addl %6, %2\n"
      "  decl %3\n"
      "  jnz 1b\n"
      "  emms\n"
      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
      : "m" (is1), "m" (ss1), "m" (ss2)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_acc_12xn_s16_u8_mmx,
    multiply_and_acc_12xn_s16_u8, OIL_IMPL_FLAG_MMX);
sl@0
  1668
sl@0
  1669
#ifdef ENABLE_BROKEN_IMPLS
sl@0
  1670
void
sl@0
  1671
mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
sl@0
  1672
    int16_t *s3_4, int16_t *s4_2, int n)
sl@0
  1673
{
sl@0
  1674
  int16_t *s2_nx4_off;
sl@0
  1675
sl@0
  1676
  while (n&3) {
sl@0
  1677
    int x;
sl@0
  1678
    int j;
sl@0
  1679
    x = s4_2[0];
sl@0
  1680
    for(j=0;j<4;j++){
sl@0
  1681
      x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
sl@0
  1682
    }
sl@0
  1683
    x >>= s4_2[1];
sl@0
  1684
    d[0] = s1[0] + x;
sl@0
  1685
sl@0
  1686
    n--;
sl@0
  1687
    d++;
sl@0
  1688
    s1++;
sl@0
  1689
    s2_nx4++;
sl@0
  1690
  }
sl@0
  1691
  if (n==0) return;
sl@0
  1692
sl@0
  1693
  s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);
sl@0
  1694
sl@0
  1695
  n >>= 2;
sl@0
  1696
  __asm__ __volatile__ ("\n"
sl@0
  1697
      "  movq 0(%[s3_4]), %%mm0\n"
sl@0
  1698
      "  pshufw $0x55, %%mm0, %%mm1\n"
sl@0
  1699
      "  pshufw $0xaa, %%mm0, %%mm2\n"
sl@0
  1700
      "  pshufw $0xff, %%mm0, %%mm3\n"
sl@0
  1701
      "  pshufw $0x00, %%mm0, %%mm0\n"
sl@0
  1702
      "  movzwl 0(%[s4_2]), %%ecx\n"
sl@0
  1703
      "  movd %%ecx, %%mm7\n"
sl@0
  1704
      "  pshufw $0x00, %%mm7, %%mm7\n"
sl@0
  1705
      "  movzwl 2(%[s4_2]), %%ecx\n"
sl@0
  1706
      "  movd %%ecx, %%mm6\n"
sl@0
  1707
      :
sl@0
  1708
      : [s3_4] "r" (s3_4),
sl@0
  1709
        [s4_2] "r" (s4_2)
sl@0
  1710
      : "ecx"
sl@0
  1711
      );
sl@0
  1712
sl@0
  1713
  __asm__ __volatile__ ("\n"
sl@0
  1714
      "1:\n"
sl@0
  1715
      "  movq 0(%[s2_nx4]), %%mm4\n"
sl@0
  1716
      "  pmullw %%mm0, %%mm4\n"
sl@0
  1717
      "  movq (%[s2_nx4],%[sstr]), %%mm5\n"
sl@0
  1718
      "  pmullw %%mm1, %%mm5\n"
sl@0
  1719
      "  paddsw %%mm5,%%mm4\n"
sl@0
  1720
      "  movq (%[s2_nx4],%[sstr],2), %%mm5\n"
sl@0
  1721
      "  pmullw %%mm2, %%mm5\n"
sl@0
  1722
      "  paddsw %%mm5,%%mm4\n"
sl@0
  1723
      "  movq (%[s2_nx4_off]), %%mm5\n"
sl@0
  1724
      "  pmullw %%mm3, %%mm5\n"
sl@0
  1725
      "  paddsw %%mm5,%%mm4\n"
sl@0
  1726
      "  paddsw %%mm7, %%mm4\n"
sl@0
  1727
      "  psraw %%mm6, %%mm4\n"
sl@0
  1728
      "  paddsw (%[s1]),%%mm4\n"
sl@0
  1729
      "  movq %%mm4, 0(%[d])\n"
sl@0
  1730
sl@0
  1731
      "  addl $8, %[s2_nx4]\n"
sl@0
  1732
      "  addl $8, %[s2_nx4_off]\n"
sl@0
  1733
      "  addl $8, %[s1]\n"
sl@0
  1734
      "  addl $8, %[d]\n"
sl@0
  1735
      "  decl %[n]\n"
sl@0
  1736
      "  jnz 1b\n"
sl@0
  1737
      "  emms\n"
sl@0
  1738
      : [s2_nx4] "+r" (s2_nx4),
sl@0
  1739
        [d] "+r" (d),
sl@0
  1740
        [s2_nx4_off] "+r" (s2_nx4_off),
sl@0
  1741
        [n] "+m" (n),
sl@0
  1742
        [s1] "+r" (s1)
sl@0
  1743
      : [sstr] "r" (sstr2)
sl@0
  1744
      );
sl@0
  1745
}
sl@0
  1746
OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
sl@0
  1747
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
sl@0
  1748
#endif
sl@0
  1749
sl@0
  1750
/*
 * mas4_across_add_s16_mmx:
 *   4-tap "across" (vertical) multiply-accumulate with shift and add:
 *     d[i] = s1[i] + ((s4_2[0] + sum_j row_j[i]*s3_4[j]) >> s4_2[1])
 *   where row_j is s2_nx4 offset by j*sstr2 bytes.  Unlike the
 *   ENABLE_BROKEN_IMPLS variant, each 16x16 product is widened to
 *   32 bits (pmullw low half + pmulhw high half, recombined with
 *   punpcklwd/punpckhwd) so the accumulation cannot overflow, then the
 *   result is shifted with psrad and repacked with packssdw.
 */
void
mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
    int16_t *s3_4, int16_t *s4_2, int n)
{
  int16_t *s2_nx4_off;

  /* scalar prologue: peel samples until n is a multiple of 4 */
  while (n&3) {
    int x;
    int j;
    x = s4_2[0];
    for(j=0;j<4;j++){
      x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
    }
    x >>= s4_2[1];
    d[0] = s1[0] + x;

    n--;
    d++;
    s1++;
    s2_nx4++;
  }
  if (n==0) return;

  /* pointer to the 4th tap row; rows 0..2 use base + sstr scaling */
  s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);

  n >>= 2;  /* iterations of the 4-wide MMX loop */
  /* mm0 = the four taps; mm5 = {s4_2[0], s4_2[1], 0, 0} (offset, shift) */
  __asm__ __volatile__ ("\n"
      "  movq 0(%[s3_4]), %%mm0\n"
      "  pxor %%mm5, %%mm5\n"
      "  movd 0(%[s4_2]), %%mm5\n"
      :
      : [s3_4] "r" (s3_4),
        [s4_2] "r" (s4_2)
      );

  __asm__ __volatile__ ("\n"
      "1:\n"
      /* tap 0: 32-bit products of row 0, split into mm6 (lo 2) / mm7 (hi 2) */
      "  pshufw $0x00, %%mm0, %%mm6\n"
      "  pmullw 0(%[s2_nx4]), %%mm6\n"
      "  pshufw $0x00, %%mm0, %%mm3\n"
      "  pmulhw 0(%[s2_nx4]), %%mm3\n"
      "  movq %%mm6, %%mm7\n"
      "  punpcklwd %%mm3, %%mm6\n"
      "  punpckhwd %%mm3, %%mm7\n"

      /* tap 1 */
      "  pshufw $0x55, %%mm0, %%mm2\n"
      "  pmullw 0(%[s2_nx4],%[sstr]), %%mm2\n"
      "  pshufw $0x55, %%mm0, %%mm3\n"
      "  pmulhw 0(%[s2_nx4],%[sstr]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 2 */
      "  pshufw $0xaa, %%mm0, %%mm2\n"
      "  pmullw 0(%[s2_nx4],%[sstr],2), %%mm2\n"
      "  pshufw $0xaa, %%mm0, %%mm3\n"
      "  pmulhw 0(%[s2_nx4],%[sstr],2), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 3 (separate pointer, base+3*sstr can't be addressed directly) */
      "  pshufw $0xff, %%mm0, %%mm2\n"
      "  pmullw 0(%[s2_nx4_off]), %%mm2\n"
      "  pshufw $0xff, %%mm0, %%mm3\n"
      "  pmulhw 0(%[s2_nx4_off]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* $0xcc selects words {0,3,0,3} of mm5 -> two dwords of s4_2[0] */
      "  pshufw $0xcc, %%mm5, %%mm1\n"
      "  paddd %%mm1, %%mm6\n"
      "  paddd %%mm1, %%mm7\n"

      /* $0xfd selects words {1,3,3,3} -> low dword = s4_2[1] shift count */
      "  pshufw $0xfd, %%mm5, %%mm1\n"
      "  psrad %%mm1, %%mm6\n"
      "  psrad %%mm1, %%mm7\n"
      "  packssdw %%mm7, %%mm6\n"   /* back to four s16, saturating */

      "  paddsw (%[s1]),%%mm6\n"
      "  movq %%mm6, 0(%[d])\n"

      "  addl $8, %[s2_nx4]\n"
      "  addl $8, %[s2_nx4_off]\n"
      "  addl $8, %[s1]\n"
      "  addl $8, %[d]\n"
      "  decl %[n]\n"
      "  jnz 1b\n"
      "  emms\n"
      : [s2_nx4] "+r" (s2_nx4),
        [d] "+r" (d),
        [s2_nx4_off] "+r" (s2_nx4_off),
        [n] "+m" (n),
        [s1] "+r" (s1)
      : [sstr] "r" (sstr2)
      );
}
/* Register as the MMX+MMXEXT (pshufw) implementation of mas4_across_add_s16. */
OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
sl@0
  1854
sl@0
  1855
/*
 * mas8_across_add_s16_mmx:
 *   8-tap "across" multiply-accumulate with shift and add:
 *     d[i] = s1[i] + ((s4_2[0] + sum_j row_j[i]*s3_8[j]) >> s4_2[1])
 *   where row_j is s2_nx8 offset by j*sstr2 bytes.  Products are widened
 *   to 32 bits (pmullw/pmulhw + punpck) before accumulation, as in
 *   mas4_across_add_s16_mmx above; the MMX loop walks the eight rows by
 *   copying s2_nx8 into tmp and repeatedly adding the byte stride.
 *
 * NOTE(review): s2_nx8_off is computed but never referenced by the asm
 * (the tmp/addl walk covers all eight rows) -- it appears vestigial.
 * NOTE(review): "movl"/"addl" on the tmp pointer assume 32-bit pointers
 * (this is the i386-only source file).
 */
void
mas8_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx8, int sstr2,
    int16_t *s3_8, int16_t *s4_2, int n)
{
  int16_t *s2_nx8_off;
  void *tmp = NULL;

  /* scalar prologue: peel samples until n is a multiple of 4 */
  while (n&3) {
    int x;
    int j;
    x = s4_2[0];
    for(j=0;j<8;j++){
      x += OIL_GET(s2_nx8, j*sstr2, int16_t)*s3_8[j];
    }
    x >>= s4_2[1];
    d[0] = s1[0] + x;

    n--;
    d++;
    s1++;
    s2_nx8++;
  }
  if (n==0) return;

  s2_nx8_off = OIL_OFFSET(s2_nx8, 7*sstr2);

  n >>= 2;  /* iterations of the 4-wide MMX loop */
  /* mm5 = {s4_2[0], s4_2[1], 0, 0}; mm0 preload is redundant (the loop
   * reloads it each iteration) */
  __asm__ __volatile__ ("\n"
      "  movq 0(%[s3_8]), %%mm0\n"
      "  pxor %%mm5, %%mm5\n"
      "  movd 0(%[s4_2]), %%mm5\n"
      :
      : [s3_8] "r" (s3_8),
        [s4_2] "r" (s4_2)
      );

  __asm__ __volatile__ ("\n"
      "1:\n"
      "  movl %[s2_nx8], %[tmp]\n"      /* tmp = row 0 for this output group */
      "  movq 0(%[s3_8]), %%mm0\n"      /* taps 0..3 */

      /* tap 0: widen products into mm6 (lo 2 dwords) / mm7 (hi 2 dwords) */
      "  pshufw $0x00, %%mm0, %%mm6\n"
      "  pmullw 0(%[tmp]), %%mm6\n"
      "  pshufw $0x00, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm6, %%mm7\n"
      "  punpcklwd %%mm3, %%mm6\n"
      "  punpckhwd %%mm3, %%mm7\n"

      /* tap 1 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0x55, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0x55, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 2 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0xaa, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0xaa, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 3 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0xff, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0xff, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      "  movq 8(%[s3_8]), %%mm0\n"      /* taps 4..7 */

      /* tap 4 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0x00, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0x00, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 5 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0x55, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0x55, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 6 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0xaa, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0xaa, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* tap 7 */
      "  addl %[sstr], %[tmp]\n"
      "  pshufw $0xff, %%mm0, %%mm2\n"
      "  pmullw 0(%[tmp]), %%mm2\n"
      "  pshufw $0xff, %%mm0, %%mm3\n"
      "  pmulhw 0(%[tmp]), %%mm3\n"
      "  movq %%mm2, %%mm4\n"
      "  punpcklwd %%mm3, %%mm2\n"
      "  punpckhwd %%mm3, %%mm4\n"
      "  paddd %%mm2, %%mm6\n"
      "  paddd %%mm4, %%mm7\n"

      /* add offset s4_2[0] ($0xcc -> words {0,3,0,3}, i.e. two dwords) */
      "  pshufw $0xcc, %%mm5, %%mm1\n"
      "  paddd %%mm1, %%mm6\n"
      "  paddd %%mm1, %%mm7\n"

      /* shift right by s4_2[1] ($0xfd -> low dword = word 1 of mm5) */
      "  pshufw $0xfd, %%mm5, %%mm1\n"
      "  psrad %%mm1, %%mm6\n"
      "  psrad %%mm1, %%mm7\n"
      "  packssdw %%mm7, %%mm6\n"       /* back to four s16, saturating */

      "  paddsw (%[s1]),%%mm6\n"
      "  movq %%mm6, 0(%[d])\n"

      "  addl $8, %[s2_nx8]\n"
      "  addl $8, %[s1]\n"
      "  addl $8, %[d]\n"
      "  decl %[n]\n"
      "  jnz 1b\n"
      "  emms\n"
      : [s2_nx8] "+r" (s2_nx8),
        [tmp] "+r" (tmp),
        [s3_8] "+r" (s3_8),
        [d] "+r" (d),
        [n] "+m" (n),
        [s1] "+r" (s1)
      : [sstr] "m" (sstr2)
      );
}
/* Register as the MMX+MMXEXT (pshufw) implementation of mas8_across_add_s16. */
OIL_DEFINE_IMPL_FULL (mas8_across_add_s16_mmx, mas8_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
sl@0
  2012
sl@0
  2013
/*
 * lshift_s16_mmx:
 *   d1[i] = s1[i] << s3_1[0] for i in [0, n).  The scalar prologue
 *   handles the leading n%4 samples; the MMX loop then shifts four
 *   s16 samples (8 bytes) per iteration.
 *
 * Fix: return before the asm when no whole group of four remains.
 * Previously, for n < 4 the scalar loop consumed every sample, n >> 2
 * became 0, and the "decl %[n]; jnz 1b" loop decremented 0 to
 * 0xffffffff and ran ~2^32 iterations, overrunning both buffers.
 * lshift_s16_mmx_2 below already carries the equivalent guard.
 */
void
lshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  /* scalar prologue: peel samples until n is a multiple of 4 */
  while (n&3) {
    d1[0] = s1[0]<<s3_1[0];
    d1++;
    s1++;
    n--;
  }
  n >>= 2;
  if (n == 0) return;  /* nothing left for the 4-at-a-time MMX loop */
  __asm__ __volatile__ ("\n"
      "  movzwl 0(%[s3_1]), %%ecx\n"   /* shift count = s3_1[0] */
      "  movd %%ecx, %%mm1\n"
      "1:\n"
      "  movq 0(%[s1]), %%mm0\n"
      "  psllw %%mm1, %%mm0\n"
      "  movq %%mm0, 0(%[d1])\n"
      "  add $8, %[d1]\n"
      "  add $8, %[s1]\n"
      "  decl %[n]\n"
      "  jnz 1b\n"
      "  emms"
      : [d1] "+r" (d1),
        [s1] "+r" (s1),
        [n] "+r" (n)
      : [s3_1] "r" (s3_1)
      : "ecx");
}
sl@0
  2041
/* Register lshift_s16_mmx as an MMX implementation of lshift_s16. */
OIL_DEFINE_IMPL_FULL (lshift_s16_mmx, lshift_s16, OIL_IMPL_FLAG_MMX);
sl@0
  2042
sl@0
  2043
/*
 * lshift_s16_mmx_2:
 *   d1[i] = s1[i] << s3_1[0] for i in [0, n).  Unrolled variant of
 *   lshift_s16_mmx: the scalar prologue handles the leading n%8
 *   samples, the MMX loop then shifts eight s16 samples (two movq, 16
 *   bytes) per iteration.  The n == 0 early return prevents the
 *   "decl; jnz" loop from underflowing when fewer than 8 samples remain.
 */
void
lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  /* scalar prologue: peel samples until n is a multiple of 8 */
  while (n&7) {
    d1[0] = s1[0]<<s3_1[0];
    d1++;
    s1++;
    n--;
  }
  n >>= 3;
  if (n == 0) return;  /* nothing left for the 8-at-a-time MMX loop */
  __asm__ __volatile__ ("\n"
      "  movzwl 0(%[s3_1]), %%ecx\n"   /* shift count = s3_1[0] */
      "  movd %%ecx, %%mm1\n"
      "1:\n"
      "  movq 0(%[s1]), %%mm0\n"
      "  psllw %%mm1, %%mm0\n"
      "  movq %%mm0, 0(%[d1])\n"
      "  movq 8(%[s1]), %%mm0\n"
      "  psllw %%mm1, %%mm0\n"
      "  movq %%mm0, 8(%[d1])\n"
      "  add $16, %[d1]\n"
      "  add $16, %[s1]\n"
      "  decl %[n]\n"
      "  jnz 1b\n"
      "  emms"
      : [d1] "+r" (d1),
        [s1] "+r" (s1),
        [n] "+r" (n)
      : [s3_1] "r" (s3_1)
      : "ecx");
}
sl@0
  2075
/* Register lshift_s16_mmx_2 as an MMX implementation of lshift_s16. */
OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
sl@0
  2076
sl@0
  2077
sl@0
  2078
sl@0
  2079
sl@0
  2080
/*
 * Symbian export stubs (auto-generated for the __SYMBIAN32__ port):
 * each stub exposes the address of an OilFunctionImpl descriptor
 * created by an OIL_DEFINE_IMPL_FULL elsewhere in liboil.
 * NOTE(review): the "name1, name2()" declarator plus "return &a, b;"
 * comma expression looks like a code-generator artifact -- the comma
 * operator means only the second operand is returned.  Verify against
 * the Symbian build before relying on these.
 */
#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_mmx, deinterleave2_s16() {
		return &_oil_function_impl_deinterleave2_mmx, deinterleave2_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_2, deinterleave2_s16() {
		return &_oil_function_impl_deinterleave2_mmx_2, deinterleave2_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_3, deinterleave2_s16() {
		return &_oil_function_impl_deinterleave2_mmx_3, deinterleave2_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_4, deinterleave2_s16() {
		return &_oil_function_impl_deinterleave2_mmx_4, deinterleave2_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12() {
		return &_oil_function_impl_lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_interleave2_mmx, interleave2_s16() {
		return &_oil_function_impl_interleave2_mmx, interleave2_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lift_add_shift1_mmx, lift_add_shift1() {
		return &_oil_function_impl_lift_add_shift1_mmx, lift_add_shift1;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lift_sub_shift1_mmx, lift_sub_shift1() {
		return &_oil_function_impl_lift_sub_shift1_mmx, lift_sub_shift1;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lift_add_shift2_mmx, lift_add_shift2() {
		return &_oil_function_impl_lift_add_shift2_mmx, lift_add_shift2;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lift_sub_shift2_mmx, lift_sub_shift2() {
		return &_oil_function_impl_lift_sub_shift2_mmx, lift_sub_shift2;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_synth_53_mmx, synth_53() {
		return &_oil_function_impl_synth_53_mmx, synth_53;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas2_add_s16_mmx, mas2_add_s16() {
		return &_oil_function_impl_mas2_add_s16_mmx, mas2_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas2_add_s16_lim_mmx, mas2_add_s16() {
		return &_oil_function_impl_mas2_add_s16_lim_mmx, mas2_add_s16;
}
#endif
sl@0
  2170
sl@0
  2171
/*
 * Symbian export stubs, continued (auto-generated; see NOTE on the
 * first stub group about the comma-declarator pattern).
 * NOTE(review): the mas2_add_s16_mmx and mas4_add_s16_mmx stubs appear
 * twice in this run -- likely a generator duplication; confirm before
 * deduplicating.
 */
#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_add_s16_mmx, mas4_add_s16() {
		return &_oil_function_impl_mas4_add_s16_mmx, mas4_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas2_add_s16_mmx, mas2_add_s16() {
		return &_oil_function_impl_mas2_add_s16_mmx, mas2_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_add_s16_mmx, mas4_add_s16() {
		return &_oil_function_impl_mas4_add_s16_mmx, mas4_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas8_add_s16_mmx, mas8_add_s16() {
		return &_oil_function_impl_mas8_add_s16_mmx, mas8_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd, mas4_add_s16() {
		return &_oil_function_impl_mas4_add_s16_pmaddwd, mas4_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd_2, mas4_add_s16() {
		return &_oil_function_impl_mas4_add_s16_pmaddwd_2, mas4_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd, mas8_add_s16() {
		return &_oil_function_impl_mas8_add_s16_pmaddwd, mas8_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd2, mas8_add_s16() {
		return &_oil_function_impl_mas8_add_s16_pmaddwd2, mas8_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas8_add_s16_sse2, mas8_add_s16() {
		return &_oil_function_impl_mas8_add_s16_sse2, mas8_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas2_across_add_s16_mmx, mas2_across_add_s16() {
		return &_oil_function_impl_mas2_across_add_s16_mmx, mas2_across_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_add_const_rshift_s16_mmx, add_const_rshift_s16() {
		return &_oil_function_impl_add_const_rshift_s16_mmx, add_const_rshift_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_mmx, multiply_and_add_s16() {
		return &_oil_function_impl_multiply_and_add_s16_mmx, multiply_and_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8() {
		return &_oil_function_impl_multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8;
}
#endif
sl@0
  2261
sl@0
  2262
/*
 * Symbian export stubs, final group (auto-generated; see NOTE on the
 * first stub group about the comma-declarator pattern).
 * NOTE(review): the mas4_across_add_s16_mmx stub appears twice,
 * mirroring the duplicated function definition earlier in this file.
 */
#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8() {
		return &_oil_function_impl_multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx() {
		return &_oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_across_add_s16_mmx, mas4_across_add_s16() {
		return &_oil_function_impl_mas4_across_add_s16_mmx, mas4_across_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas4_across_add_s16_mmx, mas4_across_add_s16() {
		return &_oil_function_impl_mas4_across_add_s16_mmx, mas4_across_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_mas8_across_add_s16_mmx, mas8_across_add_s16() {
		return &_oil_function_impl_mas8_across_add_s16_mmx, mas8_across_add_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lshift_s16_mmx, lshift_s16() {
		return &_oil_function_impl_lshift_s16_mmx, lshift_s16;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_lshift_s16_mmx_2, lshift_s16() {
		return &_oil_function_impl_lshift_s16_mmx_2, lshift_s16;
}
#endif



#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_split_53_nomix() {
		return &_oil_function_impl_split_53_nomix;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_split_53_c() {
		return &_oil_function_impl_split_53_c;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_synth_53_c() {
		return &_oil_function_impl_synth_53_c;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_c_1() {
		return &_oil_function_impl_deinterleave2_c_1;
}
#endif

#ifdef	__SYMBIAN32__
 
OilFunctionImpl* __oil_function_impl_deinterleave2_asm() {
		return &_oil_function_impl_deinterleave2_asm;
}
#endif
sl@0
  2347