suppressed no longer needed emms()
[libav.git] / libavcodec / i386 / dsputil_mmx_avg.h
CommitLineData
de6d9b64
FB
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Gerard Lantau.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
21
22static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
23{
24 int dh, hh;
25 UINT8 *p;
26 const UINT8 *pix;
27 p = block;
28 pix = pixels;
29 hh=h>>2;
30 dh=h&3;
31 while(hh--) {
32 __asm __volatile(
33 "movq %4, %%mm0\n\t"
34 "movq 1%4, %%mm1\n\t"
35 "movq %5, %%mm2\n\t"
36 "movq 1%5, %%mm3\n\t"
37 "movq %6, %%mm4\n\t"
38 "movq 1%6, %%mm5\n\t"
39 "movq %7, %%mm6\n\t"
40 "movq 1%7, %%mm7\n\t"
41 PAVGB" %%mm1, %%mm0\n\t"
42 PAVGB" %%mm3, %%mm2\n\t"
43 PAVGB" %%mm5, %%mm4\n\t"
44 PAVGB" %%mm7, %%mm6\n\t"
45 "movq %%mm0, %0\n\t"
46 "movq %%mm2, %1\n\t"
47 "movq %%mm4, %2\n\t"
48 "movq %%mm6, %3\n\t"
49 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
50 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
51 :"memory");
52 pix += line_size*4; p += line_size*4;
53 }
54 while(dh--) {
55 __asm __volatile(
56 "movq %1, %%mm0\n\t"
57 "movq 1%1, %%mm1\n\t"
58 PAVGB" %%mm1, %%mm0\n\t"
59 "movq %%mm0, %0\n\t"
60 :"=m"(*p)
61 :"m"(*pix)
62 :"memory");
63 pix += line_size; p += line_size;
64 }
de6d9b64
FB
65}
66
67static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
68{
69 int dh, hh;
70 UINT8 *p;
71 const UINT8 *pix;
72 p = block;
73 pix = pixels;
74
75 hh=h>>1;
76 dh=h&1;
77 while(hh--) {
78 __asm __volatile(
79 "movq %2, %%mm0\n\t"
80 "movq %3, %%mm1\n\t"
81 "movq %4, %%mm2\n\t"
82 PAVGB" %%mm1, %%mm0\n\t"
83 PAVGB" %%mm2, %%mm1\n\t"
84 "movq %%mm0, %0\n\t"
85 "movq %%mm1, %1\n\t"
86 :"=m"(*p), "=m"(*(p+line_size))
87 :"m"(*pix), "m"(*(pix+line_size)),
88 "m"(*(pix+line_size*2))
89 :"memory");
90 pix += line_size*2;
91 p += line_size*2;
92 }
93 if(dh) {
94 __asm __volatile(
95 "movq %1, %%mm0\n\t"
96 "movq %2, %%mm1\n\t"
97 PAVGB" %%mm1, %%mm0\n\t"
98 "movq %%mm0, %0\n\t"
99 :"=m"(*p)
100 :"m"(*pix),
101 "m"(*(pix+line_size))
102 :"memory");
103 }
de6d9b64
FB
104}
105
106static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
107{
108 int dh, hh;
109 UINT8 *p;
110 const UINT8 *pix;
111 p = block;
112 pix = pixels;
113 hh=h>>2;
114 dh=h&3;
115 while(hh--) {
116 __asm __volatile(
117 "movq %0, %%mm0\n\t"
118 "movq %4, %%mm1\n\t"
119 "movq %1, %%mm2\n\t"
120 "movq %5, %%mm3\n\t"
121 "movq %2, %%mm4\n\t"
122 "movq %6, %%mm5\n\t"
123 "movq %3, %%mm6\n\t"
124 "movq %7, %%mm7\n\t"
125 PAVGB" %%mm1, %%mm0\n\t"
126 PAVGB" %%mm3, %%mm2\n\t"
127 PAVGB" %%mm5, %%mm4\n\t"
128 PAVGB" %%mm7, %%mm6\n\t"
129 "movq %%mm0, %0\n\t"
130 "movq %%mm2, %1\n\t"
131 "movq %%mm4, %2\n\t"
132 "movq %%mm6, %3\n\t"
133 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
134 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
135 :"memory");
136 pix += line_size*4; p += line_size*4;
137 }
138 while(dh--) {
139 __asm __volatile(
140 "movq %0, %%mm0\n\t"
141 "movq %1, %%mm1\n\t"
142 PAVGB" %%mm1, %%mm0\n\t"
143 "movq %%mm0, %0\n\t"
144 :"=m"(*p)
145 :"m"(*pix)
146 :"memory");
147 pix += line_size; p += line_size;
148 }
de6d9b64
FB
149}
150
151static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
152{
153 int dh, hh;
154 UINT8 *p;
155 const UINT8 *pix;
156 p = block;
157 pix = pixels;
158 hh=h>>1;
159 dh=h&1;
160 while(hh--) {
161 __asm __volatile(
162 "movq %2, %%mm2\n\t"
163 "movq 1%2, %%mm3\n\t"
164 "movq %3, %%mm4\n\t"
165 "movq 1%3, %%mm5\n\t"
166 "movq %0, %%mm0\n\t"
167 "movq %1, %%mm1\n\t"
168 PAVGB" %%mm3, %%mm2\n\t"
169 PAVGB" %%mm2, %%mm0\n\t"
170 PAVGB" %%mm5, %%mm4\n\t"
171 PAVGB" %%mm4, %%mm1\n\t"
172 "movq %%mm0, %0\n\t"
173 "movq %%mm1, %1\n\t"
174 :"=m"(*p), "=m"(*(p+line_size))
175 :"m"(*pix), "m"(*(pix+line_size))
176 :"memory");
177 pix += line_size*2;
178 p += line_size*2;
179 }
180 if(dh) {
181 __asm __volatile(
182 "movq %1, %%mm1\n\t"
183 "movq 1%1, %%mm2\n\t"
184 "movq %0, %%mm0\n\t"
185 PAVGB" %%mm2, %%mm1\n\t"
186 PAVGB" %%mm1, %%mm0\n\t"
187 "movq %%mm0, %0\n\t"
188 :"=m"(*p)
189 :"m"(*pix)
190 :"memory");
191 }
de6d9b64
FB
192}
193
194static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
195{
196 int dh, hh;
197 UINT8 *p;
198 const UINT8 *pix;
199 p = block;
200 pix = pixels;
201 hh=h>>1;
202 dh=h&1;
203 while(hh--) {
204 __asm __volatile(
205 "movq %2, %%mm2\n\t"
206 "movq %3, %%mm3\n\t"
207 "movq %3, %%mm4\n\t"
208 "movq %4, %%mm5\n\t"
209 "movq %0, %%mm0\n\t"
210 "movq %1, %%mm1\n\t"
211 PAVGB" %%mm3, %%mm2\n\t"
212 PAVGB" %%mm2, %%mm0\n\t"
213 PAVGB" %%mm5, %%mm4\n\t"
214 PAVGB" %%mm4, %%mm1\n\t"
215 "movq %%mm0, %0\n\t"
216 "movq %%mm1, %1\n\t"
217 :"=m"(*p), "=m"(*(p+line_size))
218 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
219 :"memory");
220 pix += line_size*2;
221 p += line_size*2;
222 }
223 if(dh) {
224 __asm __volatile(
225 "movq %1, %%mm1\n\t"
226 "movq %2, %%mm2\n\t"
227 "movq %0, %%mm0\n\t"
228 PAVGB" %%mm2, %%mm1\n\t"
229 PAVGB" %%mm1, %%mm0\n\t"
230 "movq %%mm0, %0\n\t"
231 :"=m"(*p)
232 :"m"(*pix), "m"(*(pix+line_size))
233 :"memory");
234 }
de6d9b64
FB
235}
236
237static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
238{
239 UINT8 *p;
240 const UINT8 *pix;
241 p = block;
242 pix = pixels;
243 __asm __volatile(
244 "pxor %%mm7, %%mm7\n\t"
245 "movq %0, %%mm6\n\t"
246 ::"m"(mm_wtwo[0]):"memory");
247 do {
248 __asm __volatile(
249 "movq %1, %%mm0\n\t"
250 "movq %2, %%mm1\n\t"
251 "movq 1%1, %%mm4\n\t"
252 "movq 1%2, %%mm5\n\t"
253 "movq %%mm0, %%mm2\n\t"
254 "movq %%mm1, %%mm3\n\t"
255 "punpcklbw %%mm7, %%mm0\n\t"
256 "punpcklbw %%mm7, %%mm1\n\t"
257 "punpckhbw %%mm7, %%mm2\n\t"
258 "punpckhbw %%mm7, %%mm3\n\t"
259 "paddusw %%mm1, %%mm0\n\t"
260 "paddusw %%mm3, %%mm2\n\t"
261 "movq %%mm4, %%mm1\n\t"
262 "movq %%mm5, %%mm3\n\t"
263 "punpcklbw %%mm7, %%mm4\n\t"
264 "punpcklbw %%mm7, %%mm5\n\t"
265 "punpckhbw %%mm7, %%mm1\n\t"
266 "punpckhbw %%mm7, %%mm3\n\t"
267 "paddusw %%mm5, %%mm4\n\t"
268 "paddusw %%mm3, %%mm1\n\t"
269 "paddusw %%mm6, %%mm4\n\t"
270 "paddusw %%mm6, %%mm1\n\t"
271 "paddusw %%mm4, %%mm0\n\t"
272 "paddusw %%mm1, %%mm2\n\t"
273 "psrlw $2, %%mm0\n\t"
274 "psrlw $2, %%mm2\n\t"
275 "packuswb %%mm2, %%mm0\n\t"
276 PAVGB" %0, %%mm0\n\t"
277 "movq %%mm0, %0\n\t"
278 :"=m"(*p)
279 :"m"(*pix),
280 "m"(*(pix+line_size))
281 :"memory");
282 pix += line_size;
283 p += line_size ;
284 } while(--h);
de6d9b64
FB
285}
286
287static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
288{
289 DCTELEM *p;
290 const UINT8 *pix;
291 p = block;
292 pix = pixels;
293 __asm __volatile(
294 "pxor %%mm7, %%mm7":::"memory");
295 do {
296 __asm __volatile(
297 "movq 1%1, %%mm2\n\t"
298 "movq %0, %%mm0\n\t"
299 PAVGB" %1, %%mm2\n\t"
300 "movq 8%0, %%mm1\n\t"
301 "movq %%mm2, %%mm3\n\t"
302 "punpcklbw %%mm7, %%mm2\n\t"
303 "punpckhbw %%mm7, %%mm3\n\t"
304 "psubsw %%mm2, %%mm0\n\t"
305 "psubsw %%mm3, %%mm1\n\t"
306 "movq %%mm0, %0\n\t"
307 "movq %%mm1, 8%0\n\t"
308 :"=m"(*p)
309 :"m"(*pix)
310 :"memory");
311 pix += line_size;
312 p += 8;
313 } while (--h);
de6d9b64
FB
314}
315
316static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
317{
318 DCTELEM *p;
319 const UINT8 *pix;
320 p = block;
321 pix = pixels;
322 __asm __volatile(
323 "pxor %%mm7, %%mm7":::"memory");
324 do {
325 __asm __volatile(
326 "movq %2, %%mm2\n\t"
327 "movq %0, %%mm0\n\t"
328 PAVGB" %1, %%mm2\n\t"
329 "movq 8%0, %%mm1\n\t"
330 "movq %%mm2, %%mm3\n\t"
331 "punpcklbw %%mm7, %%mm2\n\t"
332 "punpckhbw %%mm7, %%mm3\n\t"
333 "psubsw %%mm2, %%mm0\n\t"
334 "psubsw %%mm3, %%mm1\n\t"
335 "movq %%mm0, %0\n\t"
336 "movq %%mm1, 8%0\n\t"
337 :"=m"(*p)
338 :"m"(*pix), "m"(*(pix+line_size))
339 :"memory");
340 pix += line_size;
341 p += 8;
342 } while (--h);
de6d9b64
FB
343}
344