//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
/*
Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

#include <liboil/liboilfunction.h>
#include <liboil/liboilclasses.h>

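/* Editorial note: this file provides 5/3 (LeGall) wavelet lifting
 * kernels - forward split, inverse synth, interleave/deinterleave
 * helpers, and multiply-accumulate-shift ("mas") filters - each as a C
 * reference and/or i386 MMX implementation registered with
 * OIL_DEFINE_IMPL. The comment blocks before each function below are
 * editorial summaries, not part of the original source. Callers would
 * normally go through the generated liboil wrappers (assumed names),
 * e.g.:
 *
 *   oil_init ();
 *   oil_split_53 (dest, src, n);  // dispatches to the fastest impl
 *
 * split_53_nomix: forward 5/3 lifting on n interleaved (even,odd)
 * pairs. The predict pass writes the odd (high-pass) outputs, the
 * update pass the even (low-pass) outputs; results stay interleaved. */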
void
split_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  /* predict */
  for(i=1;i<n*2-2;i+=2){
    d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 1);
  }
  d_2xn[n*2-1] = s_2xn[n*2-1] - s_2xn[n*2-2];

  /* update */
  d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
  for(i=2;i<n*2;i+=2){
    d_2xn[i] = s_2xn[i] + ((d_2xn[i-1] + d_2xn[i+1]) >> 2);
  }
}
OIL_DEFINE_IMPL (split_53_nomix, split_53);

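/* Disabled inverse transform. As written it operates in place on an
 * array i_n that matches neither parameter, so it would not compile
 * as-is, which is presumably why it is compiled out. */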
#if 0
void
synth_53_nomix (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  /* predict */
  i_n[0] -= i_n[1] >> 1;
  for(i=2;i<n*2;i+=2){
    i_n[i] -= (i_n[i-1] + i_n[i+1]) >> 2;
  }

  /* update */
  for(i=1;i<n*2-2;i+=2){
    i_n[i] += (i_n[i+1] + i_n[i-1]) >> 1;
  }
  i_n[n*2-1] += i_n[n*2-2];
}
#endif

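/* Same forward transform as split_53_nomix, but walking the input and
 * output pointers one (even,odd) pair at a time, with explicit
 * handling of n==1 and of the first and last pairs at the boundaries. */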
void
split_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  if (n == 1) {
    d_2xn[1] = s_2xn[1] - s_2xn[0];
    d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
  } else {
    d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
    d_2xn[0] = s_2xn[0] + (d_2xn[1] >> 1);
    d_2xn+=2;
    s_2xn+=2;
    for(i=0;i<(n*2-4)/2;i++){
      d_2xn[1] = s_2xn[1] - ((s_2xn[0] + s_2xn[2]) >> 1);
      d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
      d_2xn+=2;
      s_2xn+=2;
    }
    d_2xn[1] = s_2xn[1] - s_2xn[0];
    d_2xn[0] = s_2xn[0] + ((d_2xn[-1] + d_2xn[1]) >> 2);
  }
}
OIL_DEFINE_IMPL (split_53_c, split_53);

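/* Inverse of split_53: undoes the update step (even samples), then the
 * predict step (odd samples), with the same boundary special cases. */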
void
synth_53_c (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n == 0) return;
  if (n == 1) {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    d_2xn[1] = s_2xn[1] + d_2xn[0];
  } else {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    for(i=2;i<n*2-2;i+=2){
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
    }
    d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
    d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
    d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
  }
}
OIL_DEFINE_IMPL (synth_53_c, synth_53);

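/* Reference C implementation: split n interleaved (even,odd) pairs in
 * s_2xn into the separate arrays d1 and d2. */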
void
deinterleave2_c_1 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  int i;

  for(i=0;i<n;i++) {
    d1[i] = s_2xn[2*i];
    d2[i] = s_2xn[2*i + 1];
  }
}
OIL_DEFINE_IMPL (deinterleave2_c_1, deinterleave2_s16);

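/* Scalar i386 assembly version: the C loop peels pairs until n is
 * even, then the asm copies two (even,odd) pairs per iteration,
 * counting %%ecx down from n-2 to 0. */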
void
deinterleave2_asm (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  if (n == 0) return;

  while (n&1) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }

  asm volatile ("\n"
      " mov %3, %%ecx\n"
      " sub $2, %%ecx\n"
      "1:\n"
      " movw (%1,%%ecx,4), %%ax\n"
      " movw %%ax, (%0,%%ecx,2)\n"
      " movw 2(%1,%%ecx,4), %%ax\n"
      " movw %%ax, (%2,%%ecx,2)\n"
      " movw 4(%1,%%ecx,4), %%ax\n"
      " movw %%ax, 2(%0,%%ecx,2)\n"
      " movw 6(%1,%%ecx,4), %%ax\n"
      " movw %%ax, 2(%2,%%ecx,2)\n"
      " sub $2, %%ecx\n"
      " jge 1b\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL (deinterleave2_asm, deinterleave2_s16);

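/* MMX version: the even words are isolated by shifting each 32-bit
 * lane left then arithmetically right by 16 and packing with packssdw
 * (no saturation occurs, since every dword holds a sign-extended
 * 16-bit value); the odd words need only the arithmetic right shift. */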
void
deinterleave2_mmx (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%1,%%ecx,4), %%mm0\n"
      " movq 8(%1,%%ecx,4), %%mm1\n"
      " pslld $16, %%mm0\n"
      " pslld $16, %%mm1\n"
      " psrad $16, %%mm0\n"
      " psrad $16, %%mm1\n"
      " packssdw %%mm1, %%mm0\n"
      " movq %%mm0, (%0,%%ecx,2)\n"
      " movq (%1,%%ecx,4), %%mm0\n"
      " movq 8(%1,%%ecx,4), %%mm1\n"
      " psrad $16, %%mm0\n"
      " psrad $16, %%mm1\n"
      " packssdw %%mm1, %%mm0\n"
      " movq %%mm0, (%2,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %3, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx, deinterleave2_s16, OIL_IMPL_FLAG_MMX);

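/* Variant using pshufw (hence the MMXEXT flag): $0xd8 gathers the two
 * even words of a quad into the low dword, $0x8d the two odd words,
 * and each movd stores two samples. */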
void
deinterleave2_mmx_2 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " pshufw $0xd8, (%1,%%ecx,4), %%mm0\n"
      " movd %%mm0, (%0,%%ecx,2)\n"
      " pshufw $0x8d, (%1,%%ecx,4), %%mm0\n"
      " movd %%mm0, (%2,%%ecx,2)\n"
      " add $2, %%ecx\n"
      " cmp %3, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_2, deinterleave2_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

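/* Variant processing four pairs per iteration with two rounds of
 * punpcklwd/punpckhwd, i.e. a two-step word transpose. */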
void
deinterleave2_mmx_3 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&3) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%1,%%ecx,4), %%mm1\n"
      " movq (%1,%%ecx,4), %%mm2\n"
      " movq 8(%1,%%ecx,4), %%mm0\n"
      " punpcklwd %%mm0, %%mm1\n"
      " punpckhwd %%mm0, %%mm2\n"
      " movq %%mm1, %%mm0\n"
      " punpcklwd %%mm2, %%mm0\n"
      " punpckhwd %%mm2, %%mm1\n"
      " movq %%mm0, (%0,%%ecx,2)\n"
      " movq %%mm1, (%2,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %3, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_3, deinterleave2_s16, OIL_IMPL_FLAG_MMX);

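/* The same transpose unrolled to eight pairs per iteration, with the
 * instructions of the two halves interleaved, apparently to overlap
 * load and shuffle latency. */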
void
deinterleave2_mmx_4 (int16_t *d1, int16_t *d2, int16_t *s_2xn, int n)
{
  while (n&7) {
    d1[0] = s_2xn[0];
    d2[0] = s_2xn[1];
    d1++;
    d2++;
    s_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%1,%%ecx,4), %%mm1\n"
      " movq %%mm1, %%mm2\n"
      " movq 8(%1,%%ecx,4), %%mm0\n"
      " movq 16(%1,%%ecx,4), %%mm5\n"
      " punpcklwd %%mm0, %%mm1\n"
      " movq %%mm5, %%mm6\n"
      " punpckhwd %%mm0, %%mm2\n"
      " movq 24(%1,%%ecx,4), %%mm4\n"
      " movq %%mm1, %%mm0\n"
      " punpcklwd %%mm4, %%mm5\n"
      " punpcklwd %%mm2, %%mm0\n"
      " punpckhwd %%mm4, %%mm6\n"
      " punpckhwd %%mm2, %%mm1\n"
      " movq %%mm5, %%mm4\n"
      " movq %%mm0, (%0,%%ecx,2)\n"
      " punpcklwd %%mm6, %%mm4\n"
      " movq %%mm1, (%2,%%ecx,2)\n"
      " punpckhwd %%mm6, %%mm5\n"
      " movq %%mm4, 8(%0,%%ecx,2)\n"
      " movq %%mm5, 8(%2,%%ecx,2)\n"
      " add $8, %%ecx\n"
      " cmp %3, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s_2xn), "+r" (d2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (deinterleave2_mmx_4, deinterleave2_s16, OIL_IMPL_FLAG_MMX);


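/* Computes d[i] = s1[i] + (((s2[i] + s3[i]) * s4[0]) >> 12). The
 * 32-bit product is rebuilt from pmullw/pmulhw halves: the low half
 * shifted right by 12 (logical) OR'd with the high half shifted left
 * by 4, giving bits 12..27 of the product truncated to 16 bits. */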
void
lift_add_mult_shift12_i386_mmx (int16_t *d, int16_t *s1, int16_t *s2,
    int16_t *s3, int16_t *s4, int n)
{
  uint32_t val = *s4;

  while (n&3) {
    d[0] = s1[0] + ((s4[0]*(s2[0] + s3[0]))>>12);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  val = ((*(uint16_t *)s4)<<16) | (*(uint16_t *)s4);
  n>>=2;
  asm volatile ("\n"
      " mov %4, %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " punpcklwd %%mm7, %%mm7\n"
      " mov %5, %%ecx\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " paddsw 0(%3), %%mm0\n"
      " movq %%mm0, %%mm1\n"
      " pmullw %%mm7, %%mm0\n"
      " pmulhw %%mm7, %%mm1\n"
      " psrlw $12, %%mm0\n"
      " psllw $4, %%mm1\n"
      " por %%mm1, %%mm0\n"
      " paddsw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " add $8, %3\n"
      " decl %%ecx\n"
      " jne 1b\n"
      " emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (val), "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_mult_shift12_i386_mmx, lift_add_mult_shift12, OIL_IMPL_FLAG_MMX);

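/* Interleave two planar arrays back into (s1[i], s2[i]) pairs using
 * the word-unpack instructions; the inverse of the deinterleavers. */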
void
interleave2_mmx (int16_t *d_2xn, int16_t *s1, int16_t *s2, int n)
{
  while (n&3) {
    d_2xn[0] = s1[0];
    d_2xn[1] = s2[0];
    s1++;
    s2++;
    d_2xn+=2;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%1,%%ecx,2), %%mm0\n"
      " movq (%2,%%ecx,2), %%mm1\n"
      " movq %%mm0, %%mm2\n"
      " punpckhwd %%mm1, %%mm0\n"
      " punpcklwd %%mm1, %%mm2\n"
      " movq %%mm2, (%0,%%ecx,4)\n"
      " movq %%mm0, 8(%0,%%ecx,4)\n"
      " add $4, %%ecx\n"
      " cmp %3, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d_2xn), "+r" (s1), "+r" (s2)
      : "m" (n)
      : "eax", "ecx");
}
OIL_DEFINE_IMPL_FULL (interleave2_mmx, interleave2_s16, OIL_IMPL_FLAG_MMX);

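/* The four lift_{add,sub}_shift{1,2} kernels below are the vectorized
 * 5/3 predict/update steps: d[i] = s1[i] +/- ((s2[i] + s3[i]) >> 1)
 * (or >> 2), four samples per iteration, with a scalar loop peeling
 * the unaligned tail first. */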
void
lift_add_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%2,%%ecx,2), %%mm1\n"
      " movq (%3,%%ecx,2), %%mm2\n"
      " paddw %%mm2, %%mm1\n"
      " psraw $1, %%mm1\n"
      " paddw (%1,%%ecx,2), %%mm1\n"
      " movq %%mm1, (%0,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %4, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_shift1_mmx, lift_add_shift1, OIL_IMPL_FLAG_MMX);

void
lift_sub_shift1_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>1);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%2,%%ecx,2), %%mm1\n"
      " movq (%3,%%ecx,2), %%mm2\n"
      " movq (%1,%%ecx,2), %%mm0\n"
      " paddw %%mm2, %%mm1\n"
      " psraw $1, %%mm1\n"
      " psubw %%mm1, %%mm0\n"
      " movq %%mm0, (%0,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %4, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_sub_shift1_mmx, lift_sub_shift1, OIL_IMPL_FLAG_MMX);

void
lift_add_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] + ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%2,%%ecx,2), %%mm1\n"
      " movq (%3,%%ecx,2), %%mm2\n"
      " paddw %%mm2, %%mm1\n"
      " psraw $2, %%mm1\n"
      " paddw (%1,%%ecx,2), %%mm1\n"
      " movq %%mm1, (%0,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %4, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_add_shift2_mmx, lift_add_shift2, OIL_IMPL_FLAG_MMX);

void
lift_sub_shift2_mmx (int16_t *d, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while (n&3) {
    d[0] = s1[0] - ((s2[0] + s3[0])>>2);
    d++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  asm volatile ("\n"
      " xor %%ecx, %%ecx\n"
      "1:\n"
      " movq (%2,%%ecx,2), %%mm1\n"
      " movq (%3,%%ecx,2), %%mm2\n"
      " movq (%1,%%ecx,2), %%mm0\n"
      " paddw %%mm2, %%mm1\n"
      " psraw $2, %%mm1\n"
      " psubw %%mm1, %%mm0\n"
      " movq %%mm0, (%0,%%ecx,2)\n"
      " add $4, %%ecx\n"
      " cmp %4, %%ecx\n"
      " jl 1b\n"
      " emms\n"
      : "+r" (d), "+r" (s1), "+r" (s2), "+r" (s3)
      : "m" (n)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lift_sub_shift2_mmx, lift_sub_shift2, OIL_IMPL_FLAG_MMX);

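/* MMX version of synth_53, compiled only when ENABLE_BROKEN_IMPLS is
 * defined: the registers carry boundary samples between iterations and
 * the scalar code below finishes the tail. Note, e.g., the movw loads
 * into the 32-bit %%ecx, an operand-size mismatch that would not
 * assemble as-is. */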
#ifdef ENABLE_BROKEN_IMPLS
void
synth_53_mmx (int16_t *d_2xn, int16_t *s_2xn, int n)
{
  int i;

  if (n==0) return;
  if (n == 1) {
    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);
    d_2xn[1] = s_2xn[1] + d_2xn[0];
  } else {
    int i;

    d_2xn[0] = s_2xn[0] - (s_2xn[1] >> 1);

    if (n > 6) {
      n-=5;

      asm volatile ("\n"
          " xor %%ecx, %%ecx\n"
          " movw 2(%1), %%ecx\n"
          " movd %%ecx, %%mm7\n"
          " movw 0(%0), %%ecx\n"
          " movd %%ecx, %%mm6\n"
          " movw 0(%1), %%ecx\n"
          " movd %%ecx, %%mm5\n"

          " xor %%ecx, %%ecx\n"
          "1:\n"
          " movq 4(%1,%%ecx,4), %%mm1\n" // mm1 = s5 s4 s3 s2
          " movq %%mm1, %%mm2\n" // mm2 = s5 s4 s3 s2
          " movq 12(%1,%%ecx,4), %%mm0\n" // mm0 = s9 s8 s7 s6
          " punpcklwd %%mm0, %%mm1\n" // mm1 = s7 s3 s6 s2
          " punpckhwd %%mm0, %%mm2\n" // mm2 = s9 s5 s8 s4
          " movq %%mm1, %%mm0\n" // mm0 = s7 s3 s6 s2
          " punpcklwd %%mm2, %%mm0\n" // mm0 = s8 s6 s4 s2
          " punpckhwd %%mm2, %%mm1\n" // mm1 = s9 s7 s5 s3
          //" movq %%mm0, %%mm3\n" // mm0 = s8 s6 s4 s2

          " movq %%mm1, %%mm2\n" // mm2 = s9 s7 s5 s3
          " psllq $16, %%mm2\n" // mm2 = s7 s5 s3 00
          " por %%mm7, %%mm2\n" // mm2 = s7 s5 s3 s1
          " movq %%mm2, %%mm4\n" // mm4 = s7 s5 s3 s1
          " paddw %%mm1, %%mm2\n" // mm2 = s9+s7 ...
          " psraw $2, %%mm2\n" // mm2 = (s9+s7)>>2 ...
          " movq %%mm1, %%mm7\n" // mm7 = s9 s7 s5 s3
          " psrlq $48, %%mm7\n" // mm7 = 00 00 00 s9
          " psubw %%mm2, %%mm0\n" // mm0 = d8 d6 d4 d2

          " movq %%mm0, %%mm1\n" // mm1 = d8 d6 d4 d2
          " movq %%mm0, %%mm3\n" // mm1 = d8 d6 d4 d2
          " psllq $16, %%mm0\n" // mm0 = d6 d4 d2 00
          " por %%mm6, %%mm0\n" // mm0 = d6 d4 d2 d0
          " psrlq $48, %%mm1\n" // mm1 = 00 00 00 d8
          " movq %%mm1, %%mm6\n" // mm6 = 00 00 00 d8

          " movq %%mm0, %%mm1\n"
          " paddw %%mm3, %%mm1\n" // mm0 = d8+d6 ...
          " psraw $1, %%mm1\n" // mm1 = (d8+d6)>>1 ...
          " paddw %%mm4, %%mm1\n" // mm1 = d7 d5 d3 d1

          " movq %%mm1, %%mm2\n"

          " movq %%mm0, %%mm1\n"
          " punpcklwd %%mm2, %%mm0\n"
          " punpckhwd %%mm2, %%mm1\n"

          " movq %%mm0, (%0, %%ecx, 4)\n"
          " movq %%mm1, 8(%0, %%ecx, 4)\n"

          " add $4, %%ecx\n"
          " cmp %3, %%ecx\n"
          " jl 1b\n"
          " emms\n"
          : "+r" (d_2xn), "+r" (s_2xn), "+ecx" (i)
          : "m" (n));

      i*=2;
      n+=5;
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      i+=2;
    } else {
      i = 2;
    }
    for(;i<n*2-2;i+=2){
      d_2xn[i] = s_2xn[i] - ((s_2xn[i-1] + s_2xn[i+1]) >> 2);
      d_2xn[i-1] = s_2xn[i-1] + ((d_2xn[i] + d_2xn[i-2]) >> 1);
    }
    d_2xn[n*2-2] = s_2xn[n*2-2] - ((s_2xn[n*2-3] + s_2xn[n*2-1]) >> 2);
    d_2xn[n*2-3] = s_2xn[n*2-3] + ((d_2xn[n*2-2] + d_2xn[n*2-4]) >> 1);
    d_2xn[n*2-1] = s_2xn[n*2-1] + d_2xn[n*2-2];
  }
}
OIL_DEFINE_IMPL_FULL (synth_53_mmx, synth_53, OIL_IMPL_FLAG_MMX);
#endif

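/* "mas" = multiply, accumulate, shift: d1[i] = s1[i] + ((sum of taps
 * times neighboring s2 samples + offset) >> shift), with the taps in
 * s3_2 and offset/shift in s4_2. This version keeps 32-bit
 * intermediates by combining pmullw (low halves) with pmulhw (high
 * halves) before the paddd accumulation. */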
void
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];

  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movzwl 0(%0), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      " pshufw $0x00, %%mm6, %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
      " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
      " pmullw %%mm7, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
      " pmulhw %%mm7, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
      " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
      " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
      " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
      " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0

      " movq 2(%2), %%mm2\n"
      " movq 2(%2), %%mm3\n"
      " pmullw %%mm6, %%mm2\n"
      " pmulhw %%mm6, %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
      " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
      " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1

      " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
      " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...

      " paddd %%mm5, %%mm1\n" // mm1 = s2*a0 + s3*a1 + offset, ...
      " paddd %%mm5, %%mm0\n" // mm0 = s0*a0 + s1*a1 + offset, ...

      " movd %4, %%mm4\n"
      " psrad %%mm4, %%mm1\n" // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      " psrad %%mm4, %%mm0\n" // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      " packssdw %%mm1, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

#if 0
void
mas2_add_s16_lim_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];

  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movzwl 0(%0), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      " pshufw $0x00, %%mm6, %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " paddq 2(%2), %%mm0\n"

      " movd %4, %%mm4\n"
      " psraw %%mm4, %%mm0\n"

      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_lim_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

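/* 4-tap variant of the same scheme: all four coefficients live in mm7
 * and are broadcast one at a time with pshufw ($0x00, $0x55, $0xaa,
 * $0xff), one pmullw/pmulhw product block per tap. */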
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_4,
    int16_t *s4_2, int n)
{
  int shift = s4_2[1];
  //int m;

  //m = n&3;
#if 1
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
#endif
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movq 0(%0), %%mm7\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s3_4), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
      " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
      " pshufw $0x00, %%mm7, %%mm6\n"
      " pmullw %%mm6, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
      " pmulhw %%mm6, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
      " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
      " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
      " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
      " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0

      " movq 2(%2), %%mm2\n"
      " movq 2(%2), %%mm3\n"
      " pshufw $0x55, %%mm7, %%mm6\n"
      " pmullw %%mm6, %%mm2\n"
      " pmulhw %%mm6, %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
      " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
      " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1
      " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
      " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...

      " movq 4(%2), %%mm2\n"
      " movq 4(%2), %%mm3\n"
      " pshufw $0xaa, %%mm7, %%mm6\n"
      " pmullw %%mm6, %%mm2\n"
      " pmulhw %%mm6, %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " movq %%mm4, %%mm3\n"
      " paddd %%mm3, %%mm1\n"
      " paddd %%mm2, %%mm0\n"

      " movq 6(%2), %%mm2\n"
      " movq 6(%2), %%mm3\n"
      " pshufw $0xff, %%mm7, %%mm6\n"
      " pmullw %%mm6, %%mm2\n"
      " pmulhw %%mm6, %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " movq %%mm4, %%mm3\n"
      " paddd %%mm3, %%mm1\n"
      " paddd %%mm2, %%mm0\n"

      " paddd %%mm5, %%mm1\n"
      " paddd %%mm5, %%mm0\n"

      " movd %4, %%mm4\n"
      " psrad %%mm4, %%mm1\n"
      " psrad %%mm4, %%mm0\n"

      " packssdw %%mm1, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      : "r" (shift)
      );
#if 0
  while (m) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<4;i++){
      x += s2[i]*s3_4[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    m--;
  }
#endif
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas2_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movzwl 0(%0), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      " pshufw $0x00, %%mm6, %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x00, %%mm5, %%mm5\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmullw %%mm7, %%mm0\n"
      " movq 2(%2), %%mm1\n"
      " pmullw %%mm6, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " paddw %%mm5, %%mm0\n"
      " psraw %%mm4, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_add_s16_mmx, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas4_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;

    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1] +
        s2[2]*s3_2[2] + s2[3]*s3_2[3];
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movzwl 0(%0), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      " pshufw $0x00, %%mm6, %%mm6\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x00, %%mm5, %%mm5\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      " pshufw $0x00, %%mm4, %%mm4\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " pshufw $0x00, %%mm3, %%mm3\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm2\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmullw %%mm7, %%mm0\n"
      " movq 2(%2), %%mm1\n"
      " pmullw %%mm6, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " movq 4(%2), %%mm1\n"
      " pmullw %%mm5, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " movq 6(%2), %%mm1\n"
      " pmullw %%mm4, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " paddw %%mm3, %%mm0\n"
      " psraw %%mm2, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_mmx, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas8_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<8;i++){
      x += s2[i]*s3_2[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movq 8(%0), %%mm7\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " pshufw $0x00, %%mm3, %%mm3\n"
      " pxor %%mm4, %%mm4\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " pshufw $0x00, %%mm6, %%mm1\n"
      " movq 0(%2), %%mm0\n"
      " pmullw %%mm1, %%mm0\n"
      " pshufw $0x55, %%mm6, %%mm2\n"
      " movq 2(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0xaa, %%mm6, %%mm2\n"
      " movq 4(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0xff, %%mm6, %%mm2\n"
      " movq 6(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"

      " pshufw $0x00, %%mm7, %%mm2\n"
      " movq 8(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0x55, %%mm7, %%mm2\n"
      " movq 10(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0xaa, %%mm7, %%mm2\n"
      " movq 12(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0xff, %%mm7, %%mm2\n"
      " movq 14(%2), %%mm1\n"
      " pmullw %%mm2, %%mm1\n"
      " paddw %%mm1, %%mm0\n"

      " paddw %%mm3, %%mm0\n"
      " psraw %%mm4, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_mmx, mas8_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

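/* pmaddwd variants: pmaddwd multiplies four 16-bit pairs and adds
 * adjacent products into two 32-bit sums, so a 4-tap filter needs one
 * pmaddwd plus a horizontal add; this first version produces one
 * output sample per iteration. */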
void
mas4_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmaddwd %%mm6, %%mm0\n"
      " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
      " paddd %%mm1, %%mm0\n"
      " paddd %%mm3, %%mm0\n"
      " psrad %%mm4, %%mm0\n"
      " movd %%mm0, %%eax\n"
      " addw 0(%1), %%ax\n"
      " movw %%ax, 0(%0)\n"
      " add $2, %0\n"
      " add $2, %1\n"
      " add $2, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

void
mas4_add_s16_pmaddwd_2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " pshufw $0x44, %%mm3, %%mm3\n" // 01 00 01 00
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  if (n&1) {
    asm volatile ("\n"
        " movq 0(%2), %%mm0\n"
        " pmaddwd %%mm6, %%mm0\n"
        " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
        " paddd %%mm1, %%mm0\n"
        " paddd %%mm3, %%mm0\n"
        " psrad %%mm4, %%mm0\n"
        " movd %%mm0, %%eax\n"
        " addw 0(%1), %%ax\n"
        " movw %%ax, 0(%0)\n"
        " add $2, %0\n"
        " add $2, %1\n"
        " add $2, %2\n"
        " decl %3\n"
        : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
        :
        : "eax"
        );
  }
  n>>=1;
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmaddwd %%mm6, %%mm0\n"
      " movq 2(%2), %%mm2\n"
      " pmaddwd %%mm6, %%mm2\n"

      " movq %%mm0, %%mm1\n"
      " punpckhdq %%mm2, %%mm0\n"
      " punpckldq %%mm2, %%mm1\n"

      " paddd %%mm1, %%mm0\n"
      " paddd %%mm3, %%mm0\n"
      " psrad %%mm4, %%mm0\n"
      " pshufw $0xd8, %%mm0, %%mm0\n" // 11 01 10 00

      " paddw 0(%1), %%mm0\n"
      " movd %%mm0, 0(%0)\n"
      " add $4, %0\n"
      " add $4, %1\n"
      " add $4, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas4_add_s16_pmaddwd_2, mas4_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

void
mas8_add_s16_pmaddwd (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  if (n==0) return;
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movq 8(%0), %%mm7\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmaddwd %%mm6, %%mm0\n"
      " movq 8(%2), %%mm1\n"
      " pmaddwd %%mm7, %%mm1\n"
      " paddd %%mm1, %%mm0\n"
      " pshufw $0xee, %%mm0, %%mm1\n"
      " paddd %%mm1, %%mm0\n"
      " paddd %%mm3, %%mm0\n"
      " psrad %%mm4, %%mm0\n"
      " movd %%mm0, %%eax\n"
      " addw 0(%1), %%ax\n"
      " movw %%ax, 0(%0)\n"
      " add $2, %0\n"
      " add $2, %1\n"
      " add $2, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd, mas8_add_s16, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);


#if 0
void
mas8_add_s16_pmaddwd2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  while (n&3) {
    int x;
    int i;

    x = s4_2[0];
    for(i=0;i<8;i++){
      x += s2[i]*s3_2[i];
    }
    x >>= s4_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movq 8(%0), %%mm7\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x00, %%mm5, %%mm5\n"
      " pxor %%mm4, %%mm4\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmaddwd %%mm6, %%mm0\n"
      " movq 8(%2), %%mm1\n"
      " pmaddwd %%mm7, %%mm1\n"
      " paddd %%mm1, %%mm0\n"
      " pshufw $0xee, %%mm0, %%mm1\n"
      " paddw %%mm1, %%mm0\n"

      " movq 2(%2), %%mm2\n"
      " pmaddwd %%mm6, %%mm2\n"
      " movq 10(%2), %%mm3\n"
      " pmaddwd %%mm7, %%mm3\n"
      " paddd %%mm3, %%mm2\n"
      " pshufw $0xee, %%mm2, %%mm3\n"
      " paddw %%mm3, %%mm2\n"
      " pextrw $0, %%mm2, %%eax\n"
      " pinsrw $1, %%eax, %%mm0\n"

      " movq 4(%2), %%mm2\n"
      " pmaddwd %%mm6, %%mm2\n"
      " movq 12(%2), %%mm3\n"
      " pmaddwd %%mm7, %%mm3\n"
      " paddd %%mm3, %%mm2\n"
      " pshufw $0xee, %%mm2, %%mm3\n"
      " paddw %%mm3, %%mm2\n"
      " pextrw $0, %%mm2, %%eax\n"
      " pinsrw $2, %%eax, %%mm0\n"

      " movq 6(%2), %%mm2\n"
      " pmaddwd %%mm6, %%mm2\n"
      " movq 14(%2), %%mm3\n"
      " pmaddwd %%mm7, %%mm3\n"
      " paddd %%mm3, %%mm2\n"
      " pshufw $0xee, %%mm2, %%mm3\n"
      " paddw %%mm3, %%mm2\n"
      " pextrw $0, %%mm2, %%eax\n"
      " pinsrw $3, %%eax, %%mm0\n"

      " paddw %%mm5, %%mm0\n"
      " psraw %%mm4, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_pmaddwd2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
#endif

#if 0
/* This only does 16-bit intermediates, whereas the ref specifies 32-bit */
void
mas8_add_s16_sse2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
    int16_t *s4_2, int n)
{
  asm volatile ("\n"
      " movq 0(%0), %%mm6\n"
      " movq 8(%0), %%mm7\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm3\n"
      " pshufw $0x00, %%mm3, %%mm3\n"
      " pxor %%mm4, %%mm4\n"
      " movzwl 2(%1), %%ecx\n"
      " movd %%ecx, %%mm4\n"
      :: "r" (s3_2), "r" (s4_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmullw %%mm6, %%mm0\n"
      " movq 8(%2), %%mm1\n"
      " pmullw %%mm7, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0xee, %%mm0, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " pshufw $0x01, %%mm0, %%mm1\n"
      " paddw %%mm1, %%mm0\n"
      " paddw %%mm3, %%mm0\n"
      " psraw %%mm4, %%mm0\n"
      " movd %%mm0, %%eax\n"
      " addw 0(%1), %%ax\n"
      " movw %%ax, 0(%0)\n"
      " add $2, %0\n"
      " add $2, %1\n"
      " add $2, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
      :
      : "eax"
      );
}
OIL_DEFINE_IMPL_FULL (mas8_add_s16_sse2, mas8_add_s16, OIL_IMPL_FLAG_SSE);
#endif

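/* Like mas2_add_s16, but the two taps apply across two separate source
 * rows (s2 and s3) instead of adjacent samples of one row, i.e. a
 * vertical 2-tap filter; taps in s4_2, offset/shift in s5_2. */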
void
mas2_across_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
    int16_t *s4_2, int16_t *s5_2, int n)
{
  int shift = s5_2[1];

  while (n&3) {
    int x;

    x = s5_2[0] + s2[0]*s4_2[0] + s3[0]*s4_2[1];
    x >>= s5_2[1];
    d1[0] = s1[0] + x;

    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  if (n==0) return;

  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      " movzwl 0(%0), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%0), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      " pshufw $0x00, %%mm6, %%mm6\n"
      " movzwl 0(%1), %%ecx\n"
      " movd %%ecx, %%mm5\n"
      " pshufw $0x44, %%mm5, %%mm5\n"
      :: "r" (s4_2), "r" (s5_2)
      : "ecx"
      );
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n" // mm0 = s0, s1, s2, s3
      " movq 0(%2), %%mm1\n" // mm1 = s0, s1, s2, s3
      " pmullw %%mm7, %%mm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
      " pmulhw %%mm7, %%mm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
      " movq %%mm0, %%mm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
      " punpcklwd %%mm1, %%mm0\n" // mm0 = s0*a0, s1*a0
      " punpckhwd %%mm1, %%mm2\n" // mm2 = s2*a0, s3*a0
      " movq %%mm2, %%mm1\n" // mm1 = s2*a0, s3*a0

      " movq 0(%3), %%mm2\n"
      " movq 0(%3), %%mm3\n"
      " pmullw %%mm6, %%mm2\n"
      " pmulhw %%mm6, %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n" // mm2 = s1*a1, s2*a1
      " punpckhwd %%mm3, %%mm4\n" // mm4 = s3*a1, s4*a1
      " movq %%mm4, %%mm3\n" // mm3 = s3*a1, s4*a1

      " paddd %%mm3, %%mm1\n" // mm1 = s2*a0 + s3*a1, ...
      " paddd %%mm2, %%mm0\n" // mm0 = s0*a0 + s1*a1, ...

      " paddd %%mm5, %%mm1\n" // mm1 = s2*a0 + s3*a1 + offset, ...
      " paddd %%mm5, %%mm0\n" // mm0 = s0*a0 + s1*a1 + offset, ...

      " movd %5, %%mm4\n"
      " psrad %%mm4, %%mm1\n" // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
      " psrad %%mm4, %%mm0\n" // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...

      " packssdw %%mm1, %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " add $8, %3\n"
      " decl %4\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+m" (n)
      : "r" (shift)
      );
}
OIL_DEFINE_IMPL_FULL (mas2_across_add_s16_mmx, mas2_across_add_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

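/* d1[i] = (s1[i] + s2_2[0]) >> s2_2[1], four samples at a time, using
 * a saturating add (paddsw) and an arithmetic right shift. */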
void
add_const_rshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2_2, int n)
{
  while(n&3) {
    d1[0] = (s1[0] + s2_2[0])>>s2_2[1];
    d1++;
    s1++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      " movzwl 0(%2), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%2), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      "1:\n"
      " movq 0(%1), %%mm0\n"
      " paddsw %%mm7, %%mm0\n"
      " psraw %%mm6, %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2_2), "+r" (n)
      :
      : "ecx"
      );
}
OIL_DEFINE_IMPL_FULL (add_const_rshift_s16_mmx, add_const_rshift_s16,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);

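/* Elementwise d1[i] = s1[i] + s2[i]*s3[i]; pmullw keeps only the low
 * 16 bits of each product, matching the int16 destination. The u8
 * variants below zero-extend the byte source via punpcklbw with a
 * zeroed mm7 first. */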
void
multiply_and_add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3, int n)
{
  while(n&3) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=2;
  if (n==0) return;
  asm volatile ("\n"
      "1:\n"
      " movq 0(%2), %%mm0\n"
      " pmullw 0(%3), %%mm0\n"
      " paddw 0(%1), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " add $8, %0\n"
      " add $8, %1\n"
      " add $8, %2\n"
      " add $8, %3\n"
      " decl %4\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_mmx, multiply_and_add_s16,
    OIL_IMPL_FLAG_MMX);

sl@0
|
1554 |
void
|
sl@0
|
1555 |
multiply_and_add_s16_u8_mmx(int16_t *d1, int16_t *s1, int16_t *s2,
|
sl@0
|
1556 |
uint8_t *s3, int n)
|
sl@0
|
1557 |
{
|
sl@0
|
1558 |
while(n&3) {
|
sl@0
|
1559 |
d1[0] = s1[0] + s2[0]*s3[0];
|
sl@0
|
1560 |
d1++;
|
sl@0
|
1561 |
s1++;
|
sl@0
|
1562 |
s2++;
|
sl@0
|
1563 |
s3++;
|
sl@0
|
1564 |
n--;
|
sl@0
|
1565 |
}
|
sl@0
|
1566 |
n>>=2;
|
sl@0
|
1567 |
if (n==0) return;
|
sl@0
|
1568 |
asm volatile ("\n"
|
sl@0
|
1569 |
" pxor %%mm7, %%mm7\n"
|
sl@0
|
1570 |
"1:\n"
|
sl@0
|
1571 |
" movd 0(%3), %%mm0\n"
|
sl@0
|
1572 |
" punpcklbw %%mm7, %%mm0\n"
|
sl@0
|
1573 |
" pmullw 0(%2), %%mm0\n"
|
sl@0
|
1574 |
" paddw 0(%1), %%mm0\n"
|
sl@0
|
1575 |
" movq %%mm0, 0(%0)\n"
|
sl@0
|
1576 |
" add $8, %0\n"
|
sl@0
|
1577 |
" add $8, %1\n"
|
sl@0
|
1578 |
" add $8, %2\n"
|
sl@0
|
1579 |
" add $4, %3\n"
|
sl@0
|
1580 |
" decl %4\n"
|
sl@0
|
1581 |
" jnz 1b\n"
|
sl@0
|
1582 |
" emms\n"
|
sl@0
|
1583 |
: "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
|
sl@0
|
1584 |
);
|
sl@0
|
1585 |
|
sl@0
|
1586 |
}
|
sl@0
|
1587 |
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx, multiply_and_add_s16_u8,
|
sl@0
|
1588 |
OIL_IMPL_FLAG_MMX);
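
/* The u8 variant differs from multiply_and_add_s16_mmx only in the s3 load:
 * punpcklbw against the zeroed mm7 zero-extends four bytes to 16-bit lanes
 * before the multiply, i.e. per sample: d1[i] = s1[i] + s2[i]*(int16_t)s3[i].
 * s3 therefore advances by 4 bytes per iteration while the other pointers
 * advance by 8. */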

void
multiply_and_add_s16_u8_mmx_2(int16_t *d1, int16_t *s1, int16_t *s2,
    uint8_t *s3, int n)
{
  while(n&7) {
    d1[0] = s1[0] + s2[0]*s3[0];
    d1++;
    s1++;
    s2++;
    s3++;
    n--;
  }
  n>>=3;
  if (n==0) return;
  asm volatile ("\n"
      " pxor %%mm7, %%mm7\n"
      "1:\n"
      " movd 0(%3), %%mm0\n"
      " punpcklbw %%mm7, %%mm0\n"
      " movd 4(%3), %%mm1\n"
      " pmullw 0(%2), %%mm0\n"
      " punpcklbw %%mm7, %%mm1\n"
      " paddw 0(%1), %%mm0\n"
      " pmullw 8(%2), %%mm1\n"
      " movq %%mm0, 0(%0)\n"
      " paddw 8(%1), %%mm1\n"
      " movq %%mm1, 8(%0)\n"

      " add $16, %0\n"
      " add $16, %1\n"
      " add $16, %2\n"
      " add $8, %3\n"
      " decl %4\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (s3), "+r" (n)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_add_s16_u8_mmx_2, multiply_and_add_s16_u8,
    OIL_IMPL_FLAG_MMX);
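
/* The _2 variant is the same kernel unrolled twice (eight samples per
 * iteration), with the second movd/punpcklbw/pmullw chain interleaved
 * between the first chain's loads and stores, presumably to hide multiply
 * latency. */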

void
multiply_and_acc_12xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
    int ss1, uint8_t *s2, int ss2, int n)
{
  if (n==0) return;
  __asm__ __volatile__ ("\n"
      " pxor %%mm7, %%mm7\n"
      "1:\n"
      " movd 0(%2), %%mm0\n"
      " punpcklbw %%mm7, %%mm0\n"
      " pmullw 0(%1), %%mm0\n"
      " paddw 0(%0), %%mm0\n"
      " movq %%mm0, 0(%0)\n"
      " movd 4(%2), %%mm1\n"
      " punpcklbw %%mm7, %%mm1\n"
      " pmullw 8(%1), %%mm1\n"
      " paddw 8(%0), %%mm1\n"
      " movq %%mm1, 8(%0)\n"
      " movd 8(%2), %%mm2\n"
      " punpcklbw %%mm7, %%mm2\n"
      " pmullw 16(%1), %%mm2\n"
      " paddw 16(%0), %%mm2\n"
      " movq %%mm2, 16(%0)\n"

      " addl %4, %0\n"
      " addl %5, %1\n"
      " addl %6, %2\n"
      " decl %3\n"
      " jnz 1b\n"
      " emms\n"
      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
      : "m" (is1), "m" (ss1), "m" (ss2)
      );
}
OIL_DEFINE_IMPL_FULL (multiply_and_acc_12xn_s16_u8_mmx,
    multiply_and_acc_12xn_s16_u8, OIL_IMPL_FLAG_MMX);
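
/* Row-wise reference for the 12xn kernel above (hypothetical sketch). Each
 * iteration handles one row of 12 coefficients (three groups of four words);
 * is1/ss1/ss2 are byte strides, as the addl pointer updates imply:
 */
#if 0
static void
multiply_and_acc_12xn_s16_u8_ref (int16_t *i1, int is1, int16_t *s1, int ss1,
    uint8_t *s2, int ss2, int n)
{
  int i, j;
  for (i = 0; i < n; i++) {
    for (j = 0; j < 12; j++) {
      i1[j] += s1[j] * s2[j];
    }
    i1 = (int16_t *)((uint8_t *)i1 + is1);
    s1 = (int16_t *)((uint8_t *)s1 + ss1);
    s2 += ss2;
  }
}
#endif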

#ifdef ENABLE_BROKEN_IMPLS
void
mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
    int16_t *s3_4, int16_t *s4_2, int n)
{
  int16_t *s2_nx4_off;

  while (n&3) {
    int x;
    int j;
    x = s4_2[0];
    for(j=0;j<4;j++){
      x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
    }
    x >>= s4_2[1];
    d[0] = s1[0] + x;

    n--;
    d++;
    s1++;
    s2_nx4++;
  }
  if (n==0) return;

  s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);

  n >>= 2;
  __asm__ __volatile__ ("\n"
      " movq 0(%[s3_4]), %%mm0\n"
      " pshufw $0x55, %%mm0, %%mm1\n"
      " pshufw $0xaa, %%mm0, %%mm2\n"
      " pshufw $0xff, %%mm0, %%mm3\n"
      " pshufw $0x00, %%mm0, %%mm0\n"
      " movzwl 0(%[s4_2]), %%ecx\n"
      " movd %%ecx, %%mm7\n"
      " pshufw $0x00, %%mm7, %%mm7\n"
      " movzwl 2(%[s4_2]), %%ecx\n"
      " movd %%ecx, %%mm6\n"
      :
      : [s3_4] "r" (s3_4),
        [s4_2] "r" (s4_2)
      : "ecx"
      );

  __asm__ __volatile__ ("\n"
      "1:\n"
      " movq 0(%[s2_nx4]), %%mm4\n"
      " pmullw %%mm0, %%mm4\n"
      " movq (%[s2_nx4],%[sstr]), %%mm5\n"
      " pmullw %%mm1, %%mm5\n"
      " paddsw %%mm5,%%mm4\n"
      " movq (%[s2_nx4],%[sstr],2), %%mm5\n"
      " pmullw %%mm2, %%mm5\n"
      " paddsw %%mm5,%%mm4\n"
      " movq (%[s2_nx4_off]), %%mm5\n"
      " pmullw %%mm3, %%mm5\n"
      " paddsw %%mm5,%%mm4\n"
      " paddsw %%mm7, %%mm4\n"
      " psraw %%mm6, %%mm4\n"
      " paddsw (%[s1]),%%mm4\n"
      " movq %%mm4, 0(%[d])\n"

      " addl $8, %[s2_nx4]\n"
      " addl $8, %[s2_nx4_off]\n"
      " addl $8, %[s1]\n"
      " addl $8, %[d]\n"
      " decl %[n]\n"
      " jnz 1b\n"
      " emms\n"
      : [s2_nx4] "+r" (s2_nx4),
        [d] "+r" (d),
        [s2_nx4_off] "+r" (s2_nx4_off),
        [n] "+m" (n),
        [s1] "+r" (s1)
      : [sstr] "r" (sstr2)
      );
}
OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
#endif
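
/* The guarded implementation above accumulates 16-bit products with paddsw,
 * which can saturate for large taps; presumably why it is kept behind
 * ENABLE_BROKEN_IMPLS. The implementation below instead widens each 16x16
 * multiply to 32 bits: pmullw/pmulhw produce the low and high product
 * halves, punpcklwd/punpckhwd interleave them into dwords, the four taps
 * and the offset are accumulated with paddd, and psrad/packssdw narrow the
 * result back to saturated 16-bit samples. */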

void
mas4_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx4, int sstr2,
    int16_t *s3_4, int16_t *s4_2, int n)
{
  int16_t *s2_nx4_off;

  while (n&3) {
    int x;
    int j;
    x = s4_2[0];
    for(j=0;j<4;j++){
      x += OIL_GET(s2_nx4, j*sstr2, int16_t)*s3_4[j];
    }
    x >>= s4_2[1];
    d[0] = s1[0] + x;

    n--;
    d++;
    s1++;
    s2_nx4++;
  }
  if (n==0) return;

  s2_nx4_off = OIL_OFFSET(s2_nx4, 3*sstr2);

  n >>= 2;
  __asm__ __volatile__ ("\n"
      " movq 0(%[s3_4]), %%mm0\n"
      " pxor %%mm5, %%mm5\n"
      " movd 0(%[s4_2]), %%mm5\n"
      :
      : [s3_4] "r" (s3_4),
        [s4_2] "r" (s4_2)
      );

  __asm__ __volatile__ ("\n"
      "1:\n"
      " pshufw $0x00, %%mm0, %%mm6\n"
      " pmullw 0(%[s2_nx4]), %%mm6\n"
      " pshufw $0x00, %%mm0, %%mm3\n"
      " pmulhw 0(%[s2_nx4]), %%mm3\n"
      " movq %%mm6, %%mm7\n"
      " punpcklwd %%mm3, %%mm6\n"
      " punpckhwd %%mm3, %%mm7\n"

      " pshufw $0x55, %%mm0, %%mm2\n"
      " pmullw 0(%[s2_nx4],%[sstr]), %%mm2\n"
      " pshufw $0x55, %%mm0, %%mm3\n"
      " pmulhw 0(%[s2_nx4],%[sstr]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " pshufw $0xaa, %%mm0, %%mm2\n"
      " pmullw 0(%[s2_nx4],%[sstr],2), %%mm2\n"
      " pshufw $0xaa, %%mm0, %%mm3\n"
      " pmulhw 0(%[s2_nx4],%[sstr],2), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " pshufw $0xff, %%mm0, %%mm2\n"
      " pmullw 0(%[s2_nx4_off]), %%mm2\n"
      " pshufw $0xff, %%mm0, %%mm3\n"
      " pmulhw 0(%[s2_nx4_off]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " pshufw $0xcc, %%mm5, %%mm1\n"
      " paddd %%mm1, %%mm6\n"
      " paddd %%mm1, %%mm7\n"

      " pshufw $0xfd, %%mm5, %%mm1\n"
      " psrad %%mm1, %%mm6\n"
      " psrad %%mm1, %%mm7\n"
      " packssdw %%mm7, %%mm6\n"

      " paddsw (%[s1]),%%mm6\n"
      " movq %%mm6, 0(%[d])\n"

      " addl $8, %[s2_nx4]\n"
      " addl $8, %[s2_nx4_off]\n"
      " addl $8, %[s1]\n"
      " addl $8, %[d]\n"
      " decl %[n]\n"
      " jnz 1b\n"
      " emms\n"
      : [s2_nx4] "+r" (s2_nx4),
        [d] "+r" (d),
        [s2_nx4_off] "+r" (s2_nx4_off),
        [n] "+m" (n),
        [s1] "+r" (s1)
      : [sstr] "r" (sstr2)
      );
}
OIL_DEFINE_IMPL_FULL (mas4_across_add_s16_mmx, mas4_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);

void
mas8_across_add_s16_mmx (int16_t *d, int16_t *s1, int16_t *s2_nx8, int sstr2,
    int16_t *s3_8, int16_t *s4_2, int n)
{
  int16_t *s2_nx8_off;
  void *tmp = NULL;

  while (n&3) {
    int x;
    int j;
    x = s4_2[0];
    for(j=0;j<8;j++){
      x += OIL_GET(s2_nx8, j*sstr2, int16_t)*s3_8[j];
    }
    x >>= s4_2[1];
    d[0] = s1[0] + x;

    n--;
    d++;
    s1++;
    s2_nx8++;
  }
  if (n==0) return;

  s2_nx8_off = OIL_OFFSET(s2_nx8, 7*sstr2);

  n >>= 2;
  __asm__ __volatile__ ("\n"
      " movq 0(%[s3_8]), %%mm0\n"
      " pxor %%mm5, %%mm5\n"
      " movd 0(%[s4_2]), %%mm5\n"
      :
      : [s3_8] "r" (s3_8),
        [s4_2] "r" (s4_2)
      );

  __asm__ __volatile__ ("\n"
      "1:\n"
      " movl %[s2_nx8], %[tmp]\n"
      " movq 0(%[s3_8]), %%mm0\n"

      " pshufw $0x00, %%mm0, %%mm6\n"
      " pmullw 0(%[tmp]), %%mm6\n"
      " pshufw $0x00, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm6, %%mm7\n"
      " punpcklwd %%mm3, %%mm6\n"
      " punpckhwd %%mm3, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0x55, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0x55, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0xaa, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0xaa, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0xff, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0xff, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " movq 8(%[s3_8]), %%mm0\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0x00, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0x00, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0x55, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0x55, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0xaa, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0xaa, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " addl %[sstr], %[tmp]\n"
      " pshufw $0xff, %%mm0, %%mm2\n"
      " pmullw 0(%[tmp]), %%mm2\n"
      " pshufw $0xff, %%mm0, %%mm3\n"
      " pmulhw 0(%[tmp]), %%mm3\n"
      " movq %%mm2, %%mm4\n"
      " punpcklwd %%mm3, %%mm2\n"
      " punpckhwd %%mm3, %%mm4\n"
      " paddd %%mm2, %%mm6\n"
      " paddd %%mm4, %%mm7\n"

      " pshufw $0xcc, %%mm5, %%mm1\n"
      " paddd %%mm1, %%mm6\n"
      " paddd %%mm1, %%mm7\n"

      " pshufw $0xfd, %%mm5, %%mm1\n"
      " psrad %%mm1, %%mm6\n"
      " psrad %%mm1, %%mm7\n"
      " packssdw %%mm7, %%mm6\n"

      " paddsw (%[s1]),%%mm6\n"
      " movq %%mm6, 0(%[d])\n"

      " addl $8, %[s2_nx8]\n"
      " addl $8, %[s1]\n"
      " addl $8, %[d]\n"
      " decl %[n]\n"
      " jnz 1b\n"
      " emms\n"
      : [s2_nx8] "+r" (s2_nx8),
        [tmp] "+r" (tmp),
        [s3_8] "+r" (s3_8),
        [d] "+r" (d),
        [n] "+m" (n),
        [s1] "+r" (s1)
      : [sstr] "m" (sstr2)
      );
}
OIL_DEFINE_IMPL_FULL (mas8_across_add_s16_mmx, mas8_across_add_s16,
    OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
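
/* mas8_across differs from mas4_across mainly in bookkeeping: the eight tap
 * words no longer fit one mm register, so the tap vector is reloaded from
 * s3_8 (first 0(...), then 8(...)) each iteration, and a temporary pointer
 * walks the eight source rows by repeatedly adding the byte stride sstr2. */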

void
lshift_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  while (n&3) {
    d1[0] = s1[0]<<s3_1[0];
    d1++;
    s1++;
    n--;
  }
  n >>= 2;
  if (n == 0) return; /* skip the MMX loop entirely; decl would otherwise wrap below zero */
  __asm__ __volatile__ ("\n"
      " movzwl 0(%[s3_1]), %%ecx\n"
      " movd %%ecx, %%mm1\n"
      "1:\n"
      " movq 0(%[s1]), %%mm0\n"
      " psllw %%mm1, %%mm0\n"
      " movq %%mm0, 0(%[d1])\n"
      " add $8, %[d1]\n"
      " add $8, %[s1]\n"
      " decl %[n]\n"
      " jnz 1b\n"
      " emms"
      : [d1] "+r" (d1),
        [s1] "+r" (s1),
        [n] "+r" (n)
      : [s3_1] "r" (s3_1)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lshift_s16_mmx, lshift_s16, OIL_IMPL_FLAG_MMX);
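
/* Scalar semantics of lshift_s16 (hypothetical sketch): only s3_1[0] is
 * used; it is broadcast into mm1 once, and psllw shifts all four words by
 * that count.
 */
#if 0
static void
lshift_s16_ref (int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  int i;
  for (i = 0; i < n; i++) {
    d1[i] = s1[i] << s3_1[0];
  }
}
#endif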

void
lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
{
  while (n&7) {
    d1[0] = s1[0]<<s3_1[0];
    d1++;
    s1++;
    n--;
  }
  n >>= 3;
  if (n == 0) return;
  __asm__ __volatile__ ("\n"
      " movzwl 0(%[s3_1]), %%ecx\n"
      " movd %%ecx, %%mm1\n"
      "1:\n"
      " movq 0(%[s1]), %%mm0\n"
      " psllw %%mm1, %%mm0\n"
      " movq %%mm0, 0(%[d1])\n"
      " movq 8(%[s1]), %%mm0\n"
      " psllw %%mm1, %%mm0\n"
      " movq %%mm0, 8(%[d1])\n"
      " add $16, %[d1]\n"
      " add $16, %[s1]\n"
      " decl %[n]\n"
      " jnz 1b\n"
      " emms"
      : [d1] "+r" (d1),
        [s1] "+r" (s1),
        [n] "+r" (n)
      : [s3_1] "r" (s3_1)
      : "ecx");
}
OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
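
/* __SYMBIAN32__ builds export one getter per implementation; each returns
 * the address of the OilFunctionImpl record that the corresponding
 * OIL_DEFINE_IMPL_FULL invocation above creates. */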

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx() {
  return &_oil_function_impl_deinterleave2_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_2() {
  return &_oil_function_impl_deinterleave2_mmx_2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_3() {
  return &_oil_function_impl_deinterleave2_mmx_3;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_mmx_4() {
  return &_oil_function_impl_deinterleave2_mmx_4;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_mult_shift12_i386_mmx() {
  return &_oil_function_impl_lift_add_mult_shift12_i386_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_interleave2_mmx() {
  return &_oil_function_impl_interleave2_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_shift1_mmx() {
  return &_oil_function_impl_lift_add_shift1_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_sub_shift1_mmx() {
  return &_oil_function_impl_lift_sub_shift1_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_add_shift2_mmx() {
  return &_oil_function_impl_lift_add_shift2_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lift_sub_shift2_mmx() {
  return &_oil_function_impl_lift_sub_shift2_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_synth_53_mmx() {
  return &_oil_function_impl_synth_53_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_add_s16_mmx() {
  return &_oil_function_impl_mas2_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_add_s16_lim_mmx() {
  return &_oil_function_impl_mas2_add_s16_lim_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_mmx() {
  return &_oil_function_impl_mas4_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_mmx() {
  return &_oil_function_impl_mas8_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd() {
  return &_oil_function_impl_mas4_add_s16_pmaddwd;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_add_s16_pmaddwd_2() {
  return &_oil_function_impl_mas4_add_s16_pmaddwd_2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd() {
  return &_oil_function_impl_mas8_add_s16_pmaddwd;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_pmaddwd2() {
  return &_oil_function_impl_mas8_add_s16_pmaddwd2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_add_s16_sse2() {
  return &_oil_function_impl_mas8_add_s16_sse2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas2_across_add_s16_mmx() {
  return &_oil_function_impl_mas2_across_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_const_rshift_s16_mmx() {
  return &_oil_function_impl_add_const_rshift_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_mmx() {
  return &_oil_function_impl_multiply_and_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx() {
  return &_oil_function_impl_multiply_and_add_s16_u8_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_add_s16_u8_mmx_2() {
  return &_oil_function_impl_multiply_and_add_s16_u8_mmx_2;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx() {
  return &_oil_function_impl_multiply_and_acc_12xn_s16_u8_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas4_across_add_s16_mmx() {
  return &_oil_function_impl_mas4_across_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mas8_across_add_s16_mmx() {
  return &_oil_function_impl_mas8_across_add_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lshift_s16_mmx() {
  return &_oil_function_impl_lshift_s16_mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_lshift_s16_mmx_2() {
  return &_oil_function_impl_lshift_s16_mmx_2;
}
#endif


#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_split_53_nomix() {
  return &_oil_function_impl_split_53_nomix;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_split_53_c() {
  return &_oil_function_impl_split_53_c;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_synth_53_c() {
  return &_oil_function_impl_synth_53_c;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_c_1() {
  return &_oil_function_impl_deinterleave2_c_1;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_deinterleave2_asm() {
  return &_oil_function_impl_deinterleave2_asm;
}
#endif