/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
//First public contribution.
#include <liboil/liboilfunction.h>
OIL_DECLARE_CLASS (rowsad8x8_u8);
OIL_DECLARE_CLASS (colsad8x8_u8);
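
/*
 * Note (added for clarity, not taken from the original liboil sources):
 * judging by the implementations below, rowsad8x8_u8 takes one 8-byte row
 * from each source, computes the SAD of the first four and of the last four
 * bytes, and stores the larger of the two sums; colsad8x8_u8 walks 8 rows
 * (advancing both sources by the first stride), accumulates per-column
 * absolute differences, and stores the largest per-column sum.
 */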
static void
rowsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t diff;

  __asm__ __volatile__ (
    "  pxor %%mm6, %%mm6        \n\t" /* zero out mm6 for unpack */
    "  pxor %%mm7, %%mm7        \n\t" /* zero out mm7 for unpack */
    "  movq (%1), %%mm0         \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1         \n\t"
    "  movq %%mm0, %%mm2        \n\t"
    "  psubusb %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1     \n\t" /* B - A */
    "  por %%mm1, %%mm0         \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1        \n\t"
    "  punpcklbw %%mm6, %%mm0   \n\t" /* unpack low four bytes to higher precision */
    "  punpckhbw %%mm7, %%mm1   \n\t" /* unpack high four bytes to higher precision */
    "  movq %%mm0, %%mm2        \n\t"
    "  movq %%mm1, %%mm3        \n\t"
    "  psrlq $32, %%mm2         \n\t" /* fold and add */
    "  psrlq $32, %%mm3         \n\t"
    "  paddw %%mm2, %%mm0       \n\t"
    "  paddw %%mm3, %%mm1       \n\t"
    "  movq %%mm0, %%mm2        \n\t"
    "  movq %%mm1, %%mm3        \n\t"
    "  psrlq $16, %%mm2         \n\t"
    "  psrlq $16, %%mm3         \n\t"
    "  paddw %%mm2, %%mm0       \n\t"
    "  paddw %%mm3, %%mm1       \n\t"
    "  psubusw %%mm0, %%mm1     \n\t"
    "  paddw %%mm0, %%mm1       \n\t" /* mm1 = max(mm1, mm0) */
    "  movd %%mm1, %0           \n\t"
    "  andl $0xffff, %0         \n\t"
    "  emms                     \n\t" /* clear MMX state */
    : "=r" (diff)
    : "r" (src1), "r" (src2)
    : "memory"
  );

  *dest = diff;
#endif
}

OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, rowsad8x8_u8, OIL_IMPL_FLAG_MMX);
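
/*
 * Illustrative scalar equivalent of the MMX implementation above. This is a
 * sketch added for clarity; rowsad8x8_u8_ref_sketch is a hypothetical name
 * and not part of liboil. It is kept inside #if 0 so it is never compiled.
 */
#if 0
static void
rowsad8x8_u8_ref_sketch (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
  uint32_t sad_lo = 0, sad_hi = 0;
  int i;

  for (i = 0; i < 4; i++) {
    int d_lo = src1[i] - src2[i];
    int d_hi = src1[i + 4] - src2[i + 4];
    sad_lo += (d_lo < 0) ? -d_lo : d_lo;   /* SAD of the first four bytes */
    sad_hi += (d_hi < 0) ? -d_hi : d_hi;   /* SAD of the last four bytes */
  }
  *dest = (sad_lo > sad_hi) ? sad_lo : sad_hi; /* keep the larger half-row SAD */
}
#endif
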
static void
rowsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t diff;

  __asm__ __volatile__ (
    "  movd (%1), %%mm0         \n\t"
    "  movd (%2), %%mm1         \n\t"
    "  psadbw %%mm0, %%mm1      \n\t" /* SAD of the first four bytes */
    "  movd 4(%1), %%mm2        \n\t"
    "  movd 4(%2), %%mm3        \n\t"
    "  psadbw %%mm2, %%mm3      \n\t" /* SAD of the last four bytes */
    "  pmaxsw %%mm1, %%mm3      \n\t" /* keep the larger of the two sums */
    "  movd %%mm3, %0           \n\t"
    "  andl $0xffff, %0         \n\t"
    "  emms                     \n\t" /* clear MMX state */
    : "=r" (diff)
    : "r" (src1), "r" (src2)
    : "memory"
  );

  *dest = diff;
#endif
}

OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, rowsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
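
/*
 * The MMXEXT variant above uses psadbw to produce each 4-byte sum of absolute
 * differences in a single instruction and pmaxsw to pick the larger sum,
 * replacing the unpack/fold/compare sequence of the plain MMX version.
 */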
static void
colsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t diff;

  __asm__ __volatile__ (
    "  pxor %%mm3, %%mm3        \n\t" /* zero out mm3 for unpack */
    "  pxor %%mm4, %%mm4        \n\t" /* mm4 low sum */
    "  pxor %%mm5, %%mm5        \n\t" /* mm5 high sum */
    "  pxor %%mm6, %%mm6        \n\t" /* mm6 low sum */
    "  pxor %%mm7, %%mm7        \n\t" /* mm7 high sum */
    "  mov $4, %%edi            \n\t" /* 4 rows */
    "1:                         \n\t"
    "  movq (%1), %%mm0         \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1         \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2        \n\t"
    "  psubusb %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1     \n\t" /* B - A */
    "  por %%mm1, %%mm0         \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1        \n\t"
    "  punpcklbw %%mm3, %%mm0   \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm4       \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1   \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm5       \n\t" /* accumulate difference... */
    "  add %3, %1               \n\t" /* Inc pointer into the new data */
    "  add %3, %2               \n\t" /* Inc pointer into the new data */
    "  dec %%edi                \n\t"
    "  jnz 1b                   \n\t"

    "  mov $4, %%edi            \n\t" /* 4 rows */
    "2:                         \n\t"
    "  movq (%1), %%mm0         \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1         \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2        \n\t"
    "  psubusb %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1     \n\t" /* B - A */
    "  por %%mm1, %%mm0         \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1        \n\t"
    "  punpcklbw %%mm3, %%mm0   \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm6       \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1   \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm7       \n\t" /* accumulate difference... */
    "  add %3, %1               \n\t" /* Inc pointer into the new data */
    "  add %3, %2               \n\t" /* Inc pointer into the new data */
    "  dec %%edi                \n\t"
    "  jnz 2b                   \n\t"

    "  psubusw %%mm6, %%mm7     \n\t"
    "  paddw %%mm6, %%mm7       \n\t" /* mm7 = max(mm7, mm6) */
    "  psubusw %%mm4, %%mm5     \n\t"
    "  paddw %%mm4, %%mm5       \n\t" /* mm5 = max(mm5, mm4) */
    "  psubusw %%mm5, %%mm7     \n\t"
    "  paddw %%mm5, %%mm7       \n\t" /* mm7 = max(mm5, mm7) */
    "  movq %%mm7, %%mm6        \n\t"
    "  psrlq $32, %%mm6         \n\t"
    "  psubusw %%mm6, %%mm7     \n\t"
    "  paddw %%mm6, %%mm7       \n\t" /* mm7 = max(mm6, mm7) */
    "  movq %%mm7, %%mm6        \n\t"
    "  psrlq $16, %%mm6         \n\t"
    "  psubusw %%mm6, %%mm7     \n\t"
    "  paddw %%mm6, %%mm7       \n\t" /* mm7 = max(mm6, mm7) */
    "  movd %%mm7, %0           \n\t"
    "  andl $0xffff, %0         \n\t"
    "  emms                     \n\t" /* clear MMX state */
    : "=r" (diff),
      "+r" (src1),
      "+r" (src2)
    : "r" (ss1)                       /* the asm advances both sources by ss1 */
    : "edi", "memory"
  );

  *dest = diff;
#endif
}

OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, colsad8x8_u8, OIL_IMPL_FLAG_MMX);
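
/*
 * Illustrative scalar equivalent of the MMX implementation above. This is a
 * sketch added for clarity; colsad8x8_u8_ref_sketch is a hypothetical name
 * and not part of liboil. Like the assembly, it advances both sources by ss1.
 * It is kept inside #if 0 so it is never compiled.
 */
#if 0
static void
colsad8x8_u8_ref_sketch (uint32_t *dest, uint8_t *src1, int ss1,
    uint8_t *src2, int ss2)
{
  uint32_t colsum[8] = { 0 };
  uint32_t max = 0;
  int row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      int d = src1[col] - src2[col];
      colsum[col] += (d < 0) ? -d : d;   /* per-column absolute difference */
    }
    src1 += ss1;                         /* same stride for both, as in the asm */
    src2 += ss1;
  }
  for (col = 0; col < 8; col++) {
    if (colsum[col] > max)
      max = colsum[col];                 /* keep the largest column sum */
  }
  *dest = max;
}
#endif
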
static void
colsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__)
  uint32_t diff;

  __asm__ __volatile__ (
    "  pxor %%mm3, %%mm3        \n\t" /* zero out mm3 for unpack */
    "  pxor %%mm4, %%mm4        \n\t" /* mm4 low sum */
    "  pxor %%mm5, %%mm5        \n\t" /* mm5 high sum */
    "  pxor %%mm6, %%mm6        \n\t" /* mm6 low sum */
    "  pxor %%mm7, %%mm7        \n\t" /* mm7 high sum */
    "  mov $4, %%edi            \n\t" /* 4 rows */
    "1:                         \n\t"
    "  movq (%1), %%mm0         \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1         \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2        \n\t"
    "  psubusb %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1     \n\t" /* B - A */
    "  por %%mm1, %%mm0         \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1        \n\t"
    "  punpcklbw %%mm3, %%mm0   \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm4       \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1   \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm5       \n\t" /* accumulate difference... */
    "  add %3, %1               \n\t" /* Inc pointer into the new data */
    "  add %3, %2               \n\t" /* Inc pointer into the new data */
    "  dec %%edi                \n\t"
    "  jnz 1b                   \n\t"

    "  mov $4, %%edi            \n\t" /* 4 rows */
    "2:                         \n\t"
    "  movq (%1), %%mm0         \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1         \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2        \n\t"
    "  psubusb %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1     \n\t" /* B - A */
    "  por %%mm1, %%mm0         \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1        \n\t"
    "  punpcklbw %%mm3, %%mm0   \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm6       \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1   \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm7       \n\t" /* accumulate difference... */
    "  add %3, %1               \n\t" /* Inc pointer into the new data */
    "  add %3, %2               \n\t" /* Inc pointer into the new data */
    "  dec %%edi                \n\t"
    "  jnz 2b                   \n\t"

    "  pmaxsw %%mm6, %%mm7      \n\t"
    "  pmaxsw %%mm4, %%mm5      \n\t"
    "  pmaxsw %%mm5, %%mm7      \n\t"
    "  movq %%mm7, %%mm6        \n\t"
    "  psrlq $32, %%mm6         \n\t"
    "  pmaxsw %%mm6, %%mm7      \n\t"
    "  movq %%mm7, %%mm6        \n\t"
    "  psrlq $16, %%mm6         \n\t"
    "  pmaxsw %%mm6, %%mm7      \n\t"
    "  movd %%mm7, %0           \n\t"
    "  andl $0xffff, %0         \n\t"
    "  emms                     \n\t" /* clear MMX state */
    : "=r" (diff),
      "+r" (src1),
      "+r" (src2)
    : "r" (ss1)                       /* the asm advances both sources by ss1 */
    : "edi", "memory"
  );

  *dest = diff;
#endif
}

OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, colsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
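
/*
 * The MMXEXT variant above matches the plain MMX column SAD except for the
 * final reduction, which uses pmaxsw directly instead of the psubusw/paddw
 * pair that emulates an unsigned word maximum.
 */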
#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmx() {
  return &_oil_function_impl_rowsad8x8_u8_mmx;
}
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmxext() {
  return &_oil_function_impl_rowsad8x8_u8_mmxext;
}
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmx() {
  return &_oil_function_impl_colsad8x8_u8_mmx;
}
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmxext() {
  return &_oil_function_impl_colsad8x8_u8_mmxext;
}
#endif