e47b83f62de633d2f72a9089f8772e9df5b94651
[libav.git] / libavcodec / i386 / dsputil_mmx_avg.h
1 /*
2 * DSP utils: averaging functions. This file is compiled twice, once for 3DNow! and once for MMX2.
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
20 */
21
/*
 * put_pixels_x2: write the horizontal two-tap average of the source rows.
 *
 * For every row, the 8 bytes starting at pix are combined by PAVGB with the
 * 8 bytes starting one byte further on, and the 8-byte result is stored at p.
 * Both pointers then step by line_size.
 *
 *   block     - destination; 8 bytes are written per row
 *   pixels    - source; 9 bytes are read per row
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows to produce
 *
 * DEF() and PAVGB are macros provided by whatever includes this file
 * (presumably the per-CPU function name and the packed byte-average
 * instruction for 3DNow!/MMX2 -- confirm against the includer).
 */
22 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
23 {
24 int dh, hh;
25 UINT8 *p;
26 const UINT8 *pix;
27 p = block;
28 pix = pixels;
/* The main loop handles four rows per pass; the h&3 leftover rows are done one at a time. */
29 hh=h>>2;
30 dh=h&3;
31 while(hh--) {
32 __asm __volatile(
/*
 * %4..%7 are source rows 0..3, %0..%3 the matching destination rows.
 * "1%N" pastes a literal 1 in front of operand N's assembler text to address
 * the same row one byte further on.
 * NOTE(review): that only forms a valid +1 address if the compiler prints the
 * operand without a displacement of its own -- confirm for the compilers in use.
 */
33 "movq %4, %%mm0\n\t"
34 "movq 1%4, %%mm1\n\t"
35 "movq %5, %%mm2\n\t"
36 "movq 1%5, %%mm3\n\t"
37 "movq %6, %%mm4\n\t"
38 "movq 1%6, %%mm5\n\t"
39 "movq %7, %%mm6\n\t"
40 "movq 1%7, %%mm7\n\t"
/* Average each row with its one-byte-shifted copy (result lands in the even register). */
41 PAVGB" %%mm1, %%mm0\n\t"
42 PAVGB" %%mm3, %%mm2\n\t"
43 PAVGB" %%mm5, %%mm4\n\t"
44 PAVGB" %%mm7, %%mm6\n\t"
45 "movq %%mm0, %0\n\t"
46 "movq %%mm2, %1\n\t"
47 "movq %%mm4, %2\n\t"
48 "movq %%mm6, %3\n\t"
49 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
50 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
51 :"memory");
52 pix += line_size*4; p += line_size*4;
53 }
/* Tail: same operation, one row per pass (%1 = source row, %0 = destination row). */
54 while(dh--) {
55 __asm __volatile(
56 "movq %1, %%mm0\n\t"
57 "movq 1%1, %%mm1\n\t"
58 PAVGB" %%mm1, %%mm0\n\t"
59 "movq %%mm0, %0\n\t"
60 :"=m"(*p)
61 :"m"(*pix)
62 :"memory");
63 pix += line_size; p += line_size;
64 }
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
65 emms();
66 }
67
/*
 * put_pixels_y2: write the vertical two-tap average of the source rows.
 *
 * Destination row r receives PAVGB(source row r, source row r+1), 8 bytes
 * wide.  Producing h rows therefore reads h+1 source rows.
 *
 *   block     - destination; 8 bytes are written per row
 *   pixels    - source; 8 bytes are read from each of h+1 rows
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows to produce
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
68 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
69 {
70 int dh, hh;
71 UINT8 *p;
72 const UINT8 *pix;
73 p = block;
74 pix = pixels;
75
/* Two output rows per pass; dh is 1 when h is odd and one extra row remains. */
76 hh=h>>1;
77 dh=h&1;
78 while(hh--) {
79 __asm __volatile(
/*
 * %2, %3, %4 are three consecutive source rows.  The middle row (mm1) is
 * loaded once and takes part in both averages: mm0 = avg(row0,row1) is
 * computed first, then mm1 = avg(row1,row2).
 */
80 "movq %2, %%mm0\n\t"
81 "movq %3, %%mm1\n\t"
82 "movq %4, %%mm2\n\t"
83 PAVGB" %%mm1, %%mm0\n\t"
84 PAVGB" %%mm2, %%mm1\n\t"
85 "movq %%mm0, %0\n\t"
86 "movq %%mm1, %1\n\t"
87 :"=m"(*p), "=m"(*(p+line_size))
88 :"m"(*pix), "m"(*(pix+line_size)),
89 "m"(*(pix+line_size*2))
90 :"memory");
91 pix += line_size*2;
92 p += line_size*2;
93 }
/* Odd h: one final row.  The pointers are not advanced because nothing follows. */
94 if(dh) {
95 __asm __volatile(
96 "movq %1, %%mm0\n\t"
97 "movq %2, %%mm1\n\t"
98 PAVGB" %%mm1, %%mm0\n\t"
99 "movq %%mm0, %0\n\t"
100 :"=m"(*p)
101 :"m"(*pix),
102 "m"(*(pix+line_size))
103 :"memory");
104 }
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
105 emms();
106 }
107
/*
 * avg_pixels: average the source into the existing destination.
 *
 * For every row, the 8 bytes already at p are combined by PAVGB with the
 * 8 bytes at pix and the result is stored back to p.
 *
 *   block     - destination; 8 bytes per row are read and then overwritten
 *   pixels    - source; 8 bytes are read per row
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
108 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
109 {
110 int dh, hh;
111 UINT8 *p;
112 const UINT8 *pix;
113 p = block;
114 pix = pixels;
/* Four rows per pass of the main loop; the h&3 leftover rows are done one at a time. */
115 hh=h>>2;
116 dh=h&3;
117 while(hh--) {
118 __asm __volatile(
/*
 * Even registers take the current destination rows (%0..%3), odd registers
 * the source rows (%4..%7).
 * NOTE(review): %0..%3 are declared "=m" (write-only) yet are read here;
 * the code appears to rely on __volatile plus the "memory" clobber to keep
 * that safe -- "+m" would describe the operands more accurately.
 */
119 "movq %0, %%mm0\n\t"
120 "movq %4, %%mm1\n\t"
121 "movq %1, %%mm2\n\t"
122 "movq %5, %%mm3\n\t"
123 "movq %2, %%mm4\n\t"
124 "movq %6, %%mm5\n\t"
125 "movq %3, %%mm6\n\t"
126 "movq %7, %%mm7\n\t"
127 PAVGB" %%mm1, %%mm0\n\t"
128 PAVGB" %%mm3, %%mm2\n\t"
129 PAVGB" %%mm5, %%mm4\n\t"
130 PAVGB" %%mm7, %%mm6\n\t"
131 "movq %%mm0, %0\n\t"
132 "movq %%mm2, %1\n\t"
133 "movq %%mm4, %2\n\t"
134 "movq %%mm6, %3\n\t"
135 :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
136 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
137 :"memory");
138 pix += line_size*4; p += line_size*4;
139 }
/* Tail: same operation, one row per pass (%0 = destination row, %1 = source row). */
140 while(dh--) {
141 __asm __volatile(
142 "movq %0, %%mm0\n\t"
143 "movq %1, %%mm1\n\t"
144 PAVGB" %%mm1, %%mm0\n\t"
145 "movq %%mm0, %0\n\t"
146 :"=m"(*p)
147 :"m"(*pix)
148 :"memory");
149 pix += line_size; p += line_size;
150 }
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
151 emms();
152 }
153
/*
 * avg_pixels_x2: average the horizontally interpolated source into the
 * existing destination.
 *
 * Per row: t = PAVGB(pix[0..7], pix[1..8]); p[0..7] = PAVGB(p[0..7], t).
 * Two PAVGB steps are applied in sequence, so the result is whatever that
 * two-stage averaging yields rather than a single three-input formula.
 *
 *   block     - destination; 8 bytes per row are read and then overwritten
 *   pixels    - source; 9 bytes are read per row
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
154 static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
155 {
156 int dh, hh;
157 UINT8 *p;
158 const UINT8 *pix;
159 p = block;
160 pix = pixels;
/* Two rows per pass; dh is 1 when h is odd and one extra row remains. */
161 hh=h>>1;
162 dh=h&1;
163 while(hh--) {
164 __asm __volatile(
/*
 * %2/%3 are two source rows, %0/%1 the matching destination rows.
 * "1%N" pastes a literal 1 in front of operand N's assembler text to reach
 * the same row one byte further on.
 * NOTE(review): this only works if the operand is printed without its own
 * displacement; also %0/%1 are declared "=m" but are read below, which
 * appears to rely on the "memory" clobber -- confirm.
 */
165 "movq %2, %%mm2\n\t"
166 "movq 1%2, %%mm3\n\t"
167 "movq %3, %%mm4\n\t"
168 "movq 1%3, %%mm5\n\t"
169 "movq %0, %%mm0\n\t"
170 "movq %1, %%mm1\n\t"
/* Horizontal average first (into mm2 / mm4), then fold it into the destination row (mm0 / mm1). */
171 PAVGB" %%mm3, %%mm2\n\t"
172 PAVGB" %%mm2, %%mm0\n\t"
173 PAVGB" %%mm5, %%mm4\n\t"
174 PAVGB" %%mm4, %%mm1\n\t"
175 "movq %%mm0, %0\n\t"
176 "movq %%mm1, %1\n\t"
177 :"=m"(*p), "=m"(*(p+line_size))
178 :"m"(*pix), "m"(*(pix+line_size))
179 :"memory");
180 pix += line_size*2;
181 p += line_size*2;
182 }
/* Odd h: one final row.  The pointers are not advanced because nothing follows. */
183 if(dh) {
184 __asm __volatile(
185 "movq %1, %%mm1\n\t"
186 "movq 1%1, %%mm2\n\t"
187 "movq %0, %%mm0\n\t"
188 PAVGB" %%mm2, %%mm1\n\t"
189 PAVGB" %%mm1, %%mm0\n\t"
190 "movq %%mm0, %0\n\t"
191 :"=m"(*p)
192 :"m"(*pix)
193 :"memory");
194 }
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
195 emms();
196 }
197
/*
 * avg_pixels_y2: average the vertically interpolated source into the
 * existing destination.
 *
 * Per row r: t = PAVGB(source row r, source row r+1);
 *            destination row r = PAVGB(destination row r, t).
 * Producing h rows reads h+1 source rows, 8 bytes each.
 *
 *   block     - destination; 8 bytes per row are read and then overwritten
 *   pixels    - source; 8 bytes are read from each of h+1 rows
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
198 static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
199 {
200 int dh, hh;
201 UINT8 *p;
202 const UINT8 *pix;
203 p = block;
204 pix = pixels;
/* Two rows per pass; dh is 1 when h is odd and one extra row remains. */
205 hh=h>>1;
206 dh=h&1;
207 while(hh--) {
208 __asm __volatile(
/*
 * %2, %3, %4 are three consecutive source rows; %0/%1 the destination rows.
 * The middle row %3 is loaded twice (mm3 and mm4): mm3 is the source of the
 * first average, mm4 is overwritten as the target of the second.
 * NOTE(review): %0/%1 are declared "=m" but are read below, which appears
 * to rely on the "memory" clobber -- confirm.
 */
209 "movq %2, %%mm2\n\t"
210 "movq %3, %%mm3\n\t"
211 "movq %3, %%mm4\n\t"
212 "movq %4, %%mm5\n\t"
213 "movq %0, %%mm0\n\t"
214 "movq %1, %%mm1\n\t"
/* Vertical average first (into mm2 / mm4), then fold it into the destination row (mm0 / mm1). */
215 PAVGB" %%mm3, %%mm2\n\t"
216 PAVGB" %%mm2, %%mm0\n\t"
217 PAVGB" %%mm5, %%mm4\n\t"
218 PAVGB" %%mm4, %%mm1\n\t"
219 "movq %%mm0, %0\n\t"
220 "movq %%mm1, %1\n\t"
221 :"=m"(*p), "=m"(*(p+line_size))
222 :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
223 :"memory");
224 pix += line_size*2;
225 p += line_size*2;
226 }
/* Odd h: one final row.  The pointers are not advanced because nothing follows. */
227 if(dh) {
228 __asm __volatile(
229 "movq %1, %%mm1\n\t"
230 "movq %2, %%mm2\n\t"
231 "movq %0, %%mm0\n\t"
232 PAVGB" %%mm2, %%mm1\n\t"
233 PAVGB" %%mm1, %%mm0\n\t"
234 "movq %%mm0, %0\n\t"
235 :"=m"(*p)
236 :"m"(*pix), "m"(*(pix+line_size))
237 :"memory");
238 }
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
239 emms();
240 }
241
/*
 * avg_pixels_xy2: average the four-neighbour interpolated source into the
 * existing destination.
 *
 * Per row, for each of 8 byte positions i, the four source bytes at
 * pix[i], pix[i+1], pix[i+line_size] and pix[i+line_size+1] are widened to
 * 16 bits and summed together with the constant held in mm6, the sum is
 * shifted right by 2 and packed back to bytes.  That result is then combined
 * by PAVGB with the 8 bytes already at p and stored back to p.
 *
 *   block     - destination; 8 bytes per row are read and then overwritten
 *   pixels    - source; 9 bytes are read from each of h+1 rows
 *   line_size - byte distance between rows, used for both buffers
 *   h         - number of rows
 *
 * NOTE(review): the loop is do { } while(--h), so the body always runs at
 * least once and h == 0 would not terminate after zero rows -- callers
 * presumably always pass h >= 1; confirm.
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
242 static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h)
243 {
244 UINT8 *p;
245 const UINT8 *pix;
246 p = block;
247 pix = pixels;
/*
 * One-time setup: mm7 = 0 (zero source for the byte-to-word unpacks) and
 * mm6 = mm_wtwo[0].  mm_wtwo is defined elsewhere; presumably four 16-bit
 * words equal to 2, i.e. the rounding term for the >>2 below -- confirm.
 * NOTE(review): mm6/mm7 are set here and consumed by the separate asm
 * statement in the loop; no MMX registers are listed as clobbers, so this
 * relies on the compiler not touching them in between.
 */
248 __asm __volatile(
249 "pxor %%mm7, %%mm7\n\t"
250 "movq %0, %%mm6\n\t"
251 ::"m"(mm_wtwo[0]):"memory");
252 do {
253 __asm __volatile(
/*
 * %1 = current source row, %2 = next source row; "1%N" pastes a literal 1
 * in front of the operand text to reach the same row one byte further on
 * (NOTE(review): valid only if the operand is printed without its own
 * displacement).
 */
254 "movq %1, %%mm0\n\t"
255 "movq %2, %%mm1\n\t"
256 "movq 1%1, %%mm4\n\t"
257 "movq 1%2, %%mm5\n\t"
/* Widen the unshifted rows: mm0/mm1 keep the low 4 bytes as words, mm2/mm3 the high 4. */
258 "movq %%mm0, %%mm2\n\t"
259 "movq %%mm1, %%mm3\n\t"
260 "punpcklbw %%mm7, %%mm0\n\t"
261 "punpcklbw %%mm7, %%mm1\n\t"
262 "punpckhbw %%mm7, %%mm2\n\t"
263 "punpckhbw %%mm7, %%mm3\n\t"
/* mm0 = low-half sum of the two unshifted rows, mm2 = high-half sum. */
264 "paddusw %%mm1, %%mm0\n\t"
265 "paddusw %%mm3, %%mm2\n\t"
/* Same widening for the one-byte-shifted rows: mm4/mm5 low halves, mm1/mm3 high halves. */
266 "movq %%mm4, %%mm1\n\t"
267 "movq %%mm5, %%mm3\n\t"
268 "punpcklbw %%mm7, %%mm4\n\t"
269 "punpcklbw %%mm7, %%mm5\n\t"
270 "punpckhbw %%mm7, %%mm1\n\t"
271 "punpckhbw %%mm7, %%mm3\n\t"
272 "paddusw %%mm5, %%mm4\n\t"
273 "paddusw %%mm3, %%mm1\n\t"
/* Add the mm6 constant, combine with the unshifted sums, divide by 4 and repack to 8 bytes. */
274 "paddusw %%mm6, %%mm4\n\t"
275 "paddusw %%mm6, %%mm1\n\t"
276 "paddusw %%mm4, %%mm0\n\t"
277 "paddusw %%mm1, %%mm2\n\t"
278 "psrlw $2, %%mm0\n\t"
279 "psrlw $2, %%mm2\n\t"
280 "packuswb %%mm2, %%mm0\n\t"
/*
 * Fold the interpolated row into the existing destination row.
 * NOTE(review): %0 is declared "=m" but is read here, which appears to rely
 * on the "memory" clobber -- confirm.
 */
281 PAVGB" %0, %%mm0\n\t"
282 "movq %%mm0, %0\n\t"
283 :"=m"(*p)
284 :"m"(*pix),
285 "m"(*(pix+line_size))
286 :"memory");
287 pix += line_size;
288 p += line_size ;
289 } while(--h);
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
290 emms();
291 }
292
/*
 * sub_pixels_x2: subtract the horizontal two-tap average of the source from
 * a block of coefficients.
 *
 * Per row: t = PAVGB(pix[0..7], pix[1..8]); the 8 bytes of t are widened to
 * 16-bit words and subtracted (psubsw, signed saturating) from the 8
 * elements at p.  p then advances by 8 elements and pix by line_size.
 *
 *   block     - coefficients, updated in place; 8 elements per row, rows
 *               stored back to back.  The code handles them as 16-bit words
 *               (two 8-byte accesses per 8 elements).
 *   pixels    - source; 9 bytes are read per row
 *   line_size - byte distance between source rows
 *   h         - number of rows
 *
 * NOTE(review): the loop is do { } while (--h), so the body always runs at
 * least once and h == 0 would not terminate after zero rows -- callers
 * presumably always pass h >= 1; confirm.
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
293 static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
294 {
295 DCTELEM *p;
296 const UINT8 *pix;
297 p = block;
298 pix = pixels;
/* mm7 = 0, used below as the zero source when widening bytes to words.
 * NOTE(review): set in a separate asm statement with no register clobbers
 * declared; relies on the compiler leaving mm7 alone in between. */
299 __asm __volatile(
300 "pxor %%mm7, %%mm7":::"memory");
301 do {
302 __asm __volatile(
/*
 * %0 = current coefficient row, %1 = current source row.
 * "1%1" and "8%0" paste a literal digit in front of the operand text to
 * form a +1 / +8 byte offset.
 * NOTE(review): valid only if the operand is printed without its own
 * displacement; also %0 is declared "=m" but is read, and the second half
 * at +8 is outside the declared operand -- both appear to rely on the
 * "memory" clobber; confirm.
 */
303 "movq 1%1, %%mm2\n\t"
304 "movq %0, %%mm0\n\t"
305 PAVGB" %1, %%mm2\n\t"
306 "movq 8%0, %%mm1\n\t"
/* Split the averaged bytes into two groups of four 16-bit words (mm2 low, mm3 high). */
307 "movq %%mm2, %%mm3\n\t"
308 "punpcklbw %%mm7, %%mm2\n\t"
309 "punpckhbw %%mm7, %%mm3\n\t"
310 "psubsw %%mm2, %%mm0\n\t"
311 "psubsw %%mm3, %%mm1\n\t"
312 "movq %%mm0, %0\n\t"
313 "movq %%mm1, 8%0\n\t"
314 :"=m"(*p)
315 :"m"(*pix)
316 :"memory");
317 pix += line_size;
318 p += 8;
319 } while (--h);
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
320 emms();
321 }
322
/*
 * sub_pixels_y2: subtract the vertical two-tap average of the source from a
 * block of coefficients.
 *
 * Per row r: t = PAVGB(source row r, source row r+1); the 8 bytes of t are
 * widened to 16-bit words and subtracted (psubsw, signed saturating) from
 * the 8 elements at p.  p then advances by 8 elements and pix by line_size.
 *
 *   block     - coefficients, updated in place; 8 elements per row, rows
 *               stored back to back.  The code handles them as 16-bit words
 *               (two 8-byte accesses per 8 elements).
 *   pixels    - source; 8 bytes are read from each of h+1 rows
 *   line_size - byte distance between source rows
 *   h         - number of rows
 *
 * NOTE(review): the loop is do { } while (--h), so the body always runs at
 * least once and h == 0 would not terminate after zero rows -- callers
 * presumably always pass h >= 1; confirm.
 *
 * DEF() and PAVGB are macros provided by the including file (presumably the
 * per-CPU function name and the packed byte-average instruction -- confirm).
 */
323 static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
324 {
325 DCTELEM *p;
326 const UINT8 *pix;
327 p = block;
328 pix = pixels;
/* mm7 = 0, used below as the zero source when widening bytes to words.
 * NOTE(review): set in a separate asm statement with no register clobbers
 * declared; relies on the compiler leaving mm7 alone in between. */
329 __asm __volatile(
330 "pxor %%mm7, %%mm7":::"memory");
331 do {
332 __asm __volatile(
/*
 * %0 = current coefficient row, %1 = current source row, %2 = next source row.
 * "8%0" pastes a literal 8 in front of the operand text to reach the second
 * group of four coefficients.
 * NOTE(review): valid only if the operand is printed without its own
 * displacement; also %0 is declared "=m" but is read, and the second half
 * at +8 is outside the declared operand -- both appear to rely on the
 * "memory" clobber; confirm.
 */
333 "movq %2, %%mm2\n\t"
334 "movq %0, %%mm0\n\t"
335 PAVGB" %1, %%mm2\n\t"
336 "movq 8%0, %%mm1\n\t"
/* Split the averaged bytes into two groups of four 16-bit words (mm2 low, mm3 high). */
337 "movq %%mm2, %%mm3\n\t"
338 "punpcklbw %%mm7, %%mm2\n\t"
339 "punpckhbw %%mm7, %%mm3\n\t"
340 "psubsw %%mm2, %%mm0\n\t"
341 "psubsw %%mm3, %%mm1\n\t"
342 "movq %%mm0, %0\n\t"
343 "movq %%mm1, 8%0\n\t"
344 :"=m"(*p)
345 :"m"(*pix), "m"(*(pix+line_size))
346 :"memory");
347 pix += line_size;
348 p += 8;
349 } while (--h);
/* emms() is defined elsewhere; presumably it leaves MMX state so x87 code can run again -- confirm. */
350 emms();
351 }
352