/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

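/* Naming convention for the constants above: ff_pw_* repeat a 16-bit word
 * across all lanes, ff_pb_* a byte, ff_pdw_* a 32-bit doubleword and
 * ff_pd_* a double, so each constant can be loaded into an MMX or SSE
 * register with a single movq/movdqa. */
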
#define JUMPALIGN() asm volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  asm volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_BFE(regd) \
    asm volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared libraries it is better to synthesize these constants in
// registers than to load them from memory
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    asm volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    asm volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

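/* A worked trace of the PIC variant of MOVQ_BONE (the same reasoning applies
 * to MOVQ_WTWO, which shifts left instead of packing):
 *   pcmpeqd  reg,reg  -> 0xFFFFFFFFFFFFFFFF   (all ones, from any state)
 *   psrlw    $15,reg  -> 0x0001000100010001   (one per 16-bit word)
 *   packuswb reg,reg  -> 0x0101010101010101   (one per byte == ff_bone) */
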
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

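/* A minimal scalar sketch of the per-byte identities the PAVGB macros above
 * rely on (illustrative helpers only, not used elsewhere in this file):
 *   rounded average:   (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1)
 *   truncated average: (a + b)     >> 1 == (a & b) + ((a ^ b) >> 1)
 * MMX has no per-byte shift, so (a ^ b) is masked with 0xfe bytes before a
 * 64-bit psrlq, which is exactly what regfe/mm6 provide. */
static inline uint8_t pavgb_rnd_ref   (uint8_t a, uint8_t b){ return (a | b) - (((a ^ b) & 0xfe) >> 1); }
static inline uint8_t pavgb_no_rnd_ref(uint8_t a, uint8_t b){ return (a & b) + (((a ^ b) & 0xfe) >> 1); }
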
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    asm volatile(
        "movq   %3, %%mm0               \n\t"
        "movq   8%3, %%mm1              \n\t"
        "movq   16%3, %%mm2             \n\t"
        "movq   24%3, %%mm3             \n\t"
        "movq   32%3, %%mm4             \n\t"
        "movq   40%3, %%mm5             \n\t"
        "movq   48%3, %%mm6             \n\t"
        "movq   56%3, %%mm7             \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this loop were an exact copy of the one above, the compiler would
    // generate some very strange code, so the block pointer is passed in a
    // register ("r") instead of as a memory operand
    asm volatile(
        "movq   (%3), %%mm0             \n\t"
        "movq   8(%3), %%mm1            \n\t"
        "movq   16(%3), %%mm2           \n\t"
        "movq   24(%3), %%mm3           \n\t"
        "movq   32(%3), %%mm4           \n\t"
        "movq   40(%3), %%mm5           \n\t"
        "movq   48(%3), %%mm6           \n\t"
        "movq   56(%3), %%mm7           \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}

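/* A scalar sketch (illustrative helper; the real C fallback lives in
 * dsputil.c) of what the MMX routine above computes: each 16-bit DCT
 * coefficient is saturated to 0..255 by packuswb and stored. */
static inline void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}
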
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

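/* A scalar sketch (illustrative only) of the signed variant above: packsswb
 * first saturates each coefficient to -128..127, then adding the 0x80 bias
 * with a wrapping paddb maps that range onto 0..255 unsigned. */
static inline uint8_t signed_clamp_ref(int coeff)
{
    if (coeff < -128) coeff = -128;
    if (coeff >  127) coeff =  127;
    return (uint8_t)(coeff + 128);
}
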
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        asm volatile(
            "movq   (%2), %%mm0     \n\t"
            "movq   8(%2), %%mm1    \n\t"
            "movq   16(%2), %%mm2   \n\t"
            "movq   24(%2), %%mm3   \n\t"
            "movq   %0, %%mm4       \n\t"
            "movq   %1, %%mm6       \n\t"
            "movq   %%mm4, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0    \n\t"
            "paddsw %%mm5, %%mm1    \n\t"
            "movq   %%mm6, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2    \n\t"
            "paddsw %%mm5, %%mm3    \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq   %%mm0, %0       \n\t"
            "movq   %%mm2, %1       \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

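/* Scalar sketch (illustrative only) of the accumulation above: each pixel is
 * widened to 16 bits, added to the coefficient with signed saturation
 * (paddsw), and the sum is clamped back to 0..255 by packuswb. */
static inline uint8_t add_clamp_ref(uint8_t pixel, int coeff)
{
    int v = pixel + coeff;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}
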
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movd (%1), %%mm0              \n\t"
         "movd (%1, %3), %%mm1          \n\t"
         "movd %%mm0, (%2)              \n\t"
         "movd %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
         "lea (%3, %3), %%"REG_a"       \n\t"
         ASMALIGN(3)
         "1:                            \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "movq (%1), %%mm0              \n\t"
         "movq 8(%1), %%mm4             \n\t"
         "movq (%1, %3), %%mm1          \n\t"
         "movq 8(%1, %3), %%mm5         \n\t"
         "movq %%mm0, (%2)              \n\t"
         "movq %%mm4, 8(%2)             \n\t"
         "movq %%mm1, (%2, %3)          \n\t"
         "movq %%mm5, 8(%2, %3)         \n\t"
         "add %%"REG_a", %1             \n\t"
         "add %%"REG_a", %2             \n\t"
         "subl $4, %0                   \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size)
         : "%"REG_a, "memory"
        );
}

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
         "1:                            \n\t"
         "movdqu (%1), %%xmm0           \n\t"
         "movdqu (%1,%3), %%xmm1        \n\t"
         "movdqu (%1,%3,2), %%xmm2      \n\t"
         "movdqu (%1,%4), %%xmm3        \n\t"
         "movdqa %%xmm0, (%2)           \n\t"
         "movdqa %%xmm1, (%2,%3)        \n\t"
         "movdqa %%xmm2, (%2,%3,2)      \n\t"
         "movdqa %%xmm3, (%2,%4)        \n\t"
         "subl $4, %0                   \n\t"
         "lea (%1,%3,4), %1             \n\t"
         "lea (%2,%3,4), %2             \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size), "r"(3L*line_size)
         : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    asm volatile(
         "1:                            \n\t"
         "movdqu (%1), %%xmm0           \n\t"
         "movdqu (%1,%3), %%xmm1        \n\t"
         "movdqu (%1,%3,2), %%xmm2      \n\t"
         "movdqu (%1,%4), %%xmm3        \n\t"
         "pavgb  (%2), %%xmm0           \n\t"
         "pavgb  (%2,%3), %%xmm1        \n\t"
         "pavgb  (%2,%3,2), %%xmm2      \n\t"
         "pavgb  (%2,%4), %%xmm3        \n\t"
         "movdqa %%xmm0, (%2)           \n\t"
         "movdqa %%xmm1, (%2,%3)        \n\t"
         "movdqa %%xmm2, (%2,%3,2)      \n\t"
         "movdqa %%xmm3, (%2,%4)        \n\t"
         "subl $4, %0                   \n\t"
         "lea (%1,%3,4), %1             \n\t"
         "lea (%2,%3,4), %2             \n\t"
         "jnz 1b                        \n\t"
         : "+g"(h), "+r" (pixels),  "+r" (block)
         : "r"((long)line_size), "r"(3L*line_size)
         : "memory"
        );
}

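/* Note on the SSE2 pair above: the source rows are loaded with movdqu since
 * `pixels` carries no alignment guarantee, while the destination is written
 * with movdqa, i.e. `block` is assumed to be 16-byte aligned by the caller. */
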
static void clear_blocks_mmx(DCTELEM *blocks)
{
    asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "mov $-128*6, %%"REG_a"         \n\t"
            "1:                             \n\t"
            "movq %%mm7, (%0, %%"REG_a")    \n\t"
            "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
            "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
            "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
            "add $32, %%"REG_a"             \n\t"
            " js 1b                         \n\t"
            : : "r" (((uint8_t *)blocks)+128*6)
            : "%"REG_a
    );
}

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq   (%2, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb  (%3, %0), %%mm0         \n\t"
        "paddb 8(%3, %0), %%mm1         \n\t"
        "movq %%mm0,  (%1, %0)          \n\t"
        "movq %%mm1, 8(%1, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i] = src1[i] + src2[i];
}

#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7              \n\t"\
    "movq  %0, %%mm0                \n\t"\
    "movq  %0, %%mm1                \n\t"\
    "movq  %3, %%mm2                \n\t"\
    "movq  %3, %%mm3                \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    "movq  %1, %%mm2                \n\t"\
    "movq  %1, %%mm3                \n\t"\
    "movq  %2, %%mm4                \n\t"\
    "movq  %2, %%mm5                \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"

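/* Operand map for H263_LOOP_FILTER, as used by the two call sites below:
 * %0..%3 are the four 8-pixel rows (or transposed columns) around the block
 * edge, %4 is 2*strength from ff_h263_loop_filter_strength[qscale] and %5 is
 * the ff_pb_FC mask; the filtered rows for %0..%3 are left in mm5, mm3, mm4
 * and mm6 respectively (hence the "5 3 4 6" note at the horizontal call). */
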
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

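/* Scalar sketch (illustrative only) of the 4x4 byte transpose above, which
 * lets the horizontal loop filter below reuse the vertical filter code: */
static inline void transpose4x4_ref(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            dst[i*dst_stride + j] = src[j*src_stride + i];
}
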
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#define PAETH(cpu, abs3)\
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
{\
    long i = -bpp;\
    long end = w-3;\
    asm volatile(\
        "pxor      %%mm7, %%mm7 \n"\
        "movd    (%1,%0), %%mm0 \n"\
        "movd    (%2,%0), %%mm1 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "add       %4, %0 \n"\
        "1: \n"\
        "movq      %%mm1, %%mm2 \n"\
        "movd    (%2,%0), %%mm1 \n"\
        "movq      %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq      %%mm2, %%mm4 \n"\
        "psubw     %%mm1, %%mm3 \n"\
        "psubw     %%mm0, %%mm4 \n"\
        "movq      %%mm3, %%mm5 \n"\
        "paddw     %%mm4, %%mm5 \n"\
        abs3\
        "movq      %%mm4, %%mm6 \n"\
        "pminsw    %%mm5, %%mm6 \n"\
        "pcmpgtw   %%mm6, %%mm3 \n"\
        "pcmpgtw   %%mm5, %%mm4 \n"\
        "movq      %%mm4, %%mm6 \n"\
        "pand      %%mm3, %%mm4 \n"\
        "pandn     %%mm3, %%mm6 \n"\
        "pandn     %%mm0, %%mm3 \n"\
        "movd    (%3,%0), %%mm0 \n"\
        "pand      %%mm1, %%mm6 \n"\
        "pand      %%mm4, %%mm2 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "movq      %6,    %%mm5 \n"\
        "paddw     %%mm6, %%mm0 \n"\
        "paddw     %%mm2, %%mm3 \n"\
        "paddw     %%mm3, %%mm0 \n"\
        "pand      %%mm5, %%mm0 \n"\
        "movq      %%mm0, %%mm3 \n"\
        "packuswb  %%mm3, %%mm3 \n"\
        "movd      %%mm3, (%1,%0) \n"\
        "add       %4, %0 \n"\
        "cmp       %5, %0 \n"\
        "jle 1b \n"\
        :"+r"(i)\
        :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\
         "m"(ff_pw_255)\
        :"memory"\
    );\
}

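/* Scalar sketch of the PNG Paeth predictor that the macro above vectorizes
 * (per the PNG specification; illustrative helper only, assuming FFABS from
 * libavutil is in scope via dsputil.h): */
static inline int paeth_ref(int left, int top, int topleft)
{
    int p  = left + top - topleft;
    int pa = FFABS(p - left);
    int pb = FFABS(p - top);
    int pc = FFABS(p - topleft);
    if (pa <= pb && pa <= pc) return left;
    else if (pb <= pc)        return top;
    else                      return topleft;
}
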
#define ABS3_MMX2\
    "psubw     %%mm5, %%mm7 \n"\
    "pmaxsw    %%mm7, %%mm5 \n"\
    "pxor      %%mm6, %%mm6 \n"\
    "pxor      %%mm7, %%mm7 \n"\
    "psubw     %%mm3, %%mm6 \n"\
    "psubw     %%mm4, %%mm7 \n"\
    "pmaxsw    %%mm6, %%mm3 \n"\
    "pmaxsw    %%mm7, %%mm4 \n"\
    "pxor      %%mm7, %%mm7 \n"

#define ABS3_SSSE3\
    "pabsw     %%mm3, %%mm3 \n"\
    "pabsw     %%mm4, %%mm4 \n"\
    "pabsw     %%mm5, %%mm5 \n"

PAETH(mmx2, ABS3_MMX2)
#ifdef HAVE_SSSE3
PAETH(ssse3, ABS3_SSSE3)
#endif

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "             \n\t" /* d */\
    "movq "#in0", %%mm5               \n\t" /* D */\
    "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5               \n\t" /* C */\
    "movq "#in2", %%mm6               \n\t" /* B */\
    "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rnd */\
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                  \n\t"\
    "packuswb %%mm5, %%mm5            \n\t"\
    OP(%%mm5, out, %%mm7, d)

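/* QPEL_V_LOW evaluates one output row of the MPEG-4 quarter-pel 8-tap
 * half-pel filter (-1, 3, -6, 20, 20, -6, 3, -1), plus the rounder and a
 * >>5; a scalar sketch of the same tap evaluation (illustrative only):
 *     out = (s3+s4)*20 - (s2+s5)*6 + (s1+s6)*3 - (s0+s7);
 * as can also be seen in the C fallback rows of the 3dnow h_lowpass below. */
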
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
        \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
        \
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0             \n\t"\
            "movq 24(%0), %%mm1             \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
        \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        \
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        asm volatile(\
            "movq (%0), %%mm0               \n\t"\
            "movq 8(%0), %%mm1              \n\t"\
            "paddw %2, %%mm0                \n\t"\
            "paddw %2, %%mm1                \n\t"\
            "psraw $5, %%mm0                \n\t"\
            "psraw $5, %%mm1                \n\t"\
            "packuswb %%mm1, %%mm0          \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            :"memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "movq 8(%0), %%mm2              \n\t"\
        "movq 8(%0), %%mm3              \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 17*8(%1)           \n\t"\
        "movq %%mm2, 2*17*8(%1)         \n\t"\
        "movq %%mm3, 3*17*8(%1)         \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=4;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"  \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0                   \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        :"memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq (%0), %%mm1               \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "movq %%mm0, (%1)               \n\t"\
        "movq %%mm1, 9*8(%1)            \n\t"\
        "add $8, %1                     \n\t"\
        "add %3, %0                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
    \
    temp_ptr= temp;\
    count=2;\
    \
/*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7              \n\t"*/\
        "1:                             \n\t"\
        "movq (%0), %%mm0               \n\t"\
        "movq 8(%0), %%mm1              \n\t"\
        "movq 16(%0), %%mm2             \n\t"\
        "movq 24(%0), %%mm3             \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1                     \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "add $72, %0                    \n\t"\
        "add %6, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
    );\
}\
\
0c1a9eda 1199static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6c01d006 1200 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
826f429a
MN
1201}\
1202\
0c1a9eda 1203static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1204 uint64_t temp[8];\
826f429a
MN
1205 uint8_t * const half= (uint8_t*)temp;\
1206 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
d6af6b03 1207 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
826f429a
MN
1208}\
1209\
0c1a9eda 1210static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1211 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1212}\
1213\
0c1a9eda 1214static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1215 uint64_t temp[8];\
826f429a
MN
1216 uint8_t * const half= (uint8_t*)temp;\
1217 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
d6af6b03 1218 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
826f429a
MN
1219}\
1220\
0c1a9eda 1221static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1222 uint64_t temp[8];\
826f429a 1223 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1224 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
d6af6b03 1225 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
826f429a
MN
1226}\
1227\
0c1a9eda 1228static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1229 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1230}\
1231\
0c1a9eda 1232static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1233 uint64_t temp[8];\
826f429a 1234 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1235 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
d6af6b03 1236 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
826f429a 1237}\
0c1a9eda 1238static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1239 uint64_t half[8 + 9];\
1240 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1241 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1242 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1243 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1244 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1245 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1246}\
0c1a9eda 1247static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1248 uint64_t half[8 + 9];\
1249 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1250 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1251 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1252 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1253 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1254 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1255}\
0c1a9eda 1256static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1257 uint64_t half[8 + 9];\
1258 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1259 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1260 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1261 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
3178ee4c 1262 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1263 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1264}\
0c1a9eda 1265static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1266 uint64_t half[8 + 9];\
1267 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1268 uint8_t * const halfHV= ((uint8_t*)half);\
1269 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1270 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
3178ee4c 1271 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1272 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1273}\
0c1a9eda 1274static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1275 uint64_t half[8 + 9];\
826f429a
MN
1276 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1277 uint8_t * const halfHV= ((uint8_t*)half);\
1278 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1279 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1280 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
826f429a 1281}\
0c1a9eda 1282static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1283 uint64_t half[8 + 9];\
826f429a
MN
1284 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1285 uint8_t * const halfHV= ((uint8_t*)half);\
1286 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1287 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
d6af6b03 1288 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
826f429a 1289}\
0c1a9eda 1290static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1291 uint64_t half[8 + 9];\
1292 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1293 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1294 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
db794953 1295 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1296}\
0c1a9eda 1297static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1298 uint64_t half[8 + 9];\
1299 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1300 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
d6af6b03 1301 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
db794953 1302 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1303}\
0c1a9eda 1304static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1305 uint64_t half[9];\
826f429a
MN
1306 uint8_t * const halfH= ((uint8_t*)half);\
1307 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
3178ee4c 1308 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
826f429a 1309}\
0c1a9eda 1310static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6c01d006 1311 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
826f429a
MN
1312}\
1313\
0c1a9eda 1314static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1315 uint64_t temp[32];\
1316 uint8_t * const half= (uint8_t*)temp;\
1317 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
d6af6b03 1318 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
826f429a
MN
1319}\
1320\
0c1a9eda 1321static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1322 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1323}\
1324\
0c1a9eda 1325static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1326 uint64_t temp[32];\
1327 uint8_t * const half= (uint8_t*)temp;\
1328 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
d6af6b03 1329 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
826f429a
MN
1330}\
1331\
0c1a9eda 1332static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1333 uint64_t temp[32];\
1334 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1335 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
d6af6b03 1336 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
826f429a
MN
1337}\
1338\
0c1a9eda 1339static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
3178ee4c 1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
826f429a
MN
1341}\
1342\
0c1a9eda 1343static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1344 uint64_t temp[32];\
1345 uint8_t * const half= (uint8_t*)temp;\
3178ee4c 1346 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
d6af6b03 1347 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
826f429a 1348}\
0c1a9eda 1349static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1350 uint64_t half[16*2 + 17*2];\
1351 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1352 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1353 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1354 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1355 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1356 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1357}\
0c1a9eda 1358static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1359 uint64_t half[16*2 + 17*2];\
1360 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1361 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1362 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1363 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1364 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1365 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1366}\
0c1a9eda 1367static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1368 uint64_t half[16*2 + 17*2];\
1369 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1370 uint8_t * const halfHV= ((uint8_t*)half);\
826f429a 1371 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1372 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
3178ee4c 1373 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1374 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1375}\
0c1a9eda 1376static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1377 uint64_t half[16*2 + 17*2];\
1378 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1379 uint8_t * const halfHV= ((uint8_t*)half);\
1380 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1381 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
3178ee4c 1382 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1383 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1384}\
0c1a9eda 1385static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1386 uint64_t half[16*2 + 17*2];\
1387 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1388 uint8_t * const halfHV= ((uint8_t*)half);\
1389 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1390 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1391 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
826f429a 1392}\
0c1a9eda 1393static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a
MN
1394 uint64_t half[16*2 + 17*2];\
1395 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1396 uint8_t * const halfHV= ((uint8_t*)half);\
1397 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1398 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
d6af6b03 1399 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
826f429a 1400}\
0c1a9eda 1401static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953
MN
1402 uint64_t half[17*2];\
1403 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1404 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1405 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
db794953 1406 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1407}\
0c1a9eda 1408static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
db794953 1409 uint64_t half[17*2];\
1410 uint8_t * const halfH= ((uint8_t*)half);\
826f429a 1411 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
d6af6b03 1412 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
db794953 1413 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1414}\
0c1a9eda 1415static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
826f429a 1416 uint64_t half[17*2];\
1417 uint8_t * const halfH= ((uint8_t*)half);\
1418 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
3178ee4c 1419 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
826f429a 1420}
1421
bb270c08 1422#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
3178ee4c 1423#define AVG_3DNOW_OP(a,b,temp, size) \
bb270c08 1424"mov" #size " " #b ", " #temp " \n\t"\
1425"pavgusb " #temp ", " #a " \n\t"\
1426"mov" #size " " #a ", " #b " \n\t"
3178ee4c 1427#define AVG_MMX2_OP(a,b,temp, size) \
bb270c08 1428"mov" #size " " #b ", " #temp " \n\t"\
1429"pavgb " #temp ", " #a " \n\t"\
1430"mov" #size " " #a ", " #b " \n\t"
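/* Editor's note (illustration only): with a=%%mm0, b=(%0), temp=%%mm1 and
 * size=q, AVG_MMX2_OP above expands to
 *     "movq (%0), %%mm1     \n\t"   // load destination
 *     "pavgb %%mm1, %%mm0   \n\t"   // rounded byte-wise average
 *     "movq %%mm0, (%0)     \n\t"   // store back
 * i.e. dst = avg(dst, src) per byte; AVG_3DNOW_OP does the same with the
 * 3DNow! pavgusb instruction, and PUT_OP is a plain store. */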
3178ee4c 1431
1432QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1433QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1434QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1435QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1436QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1437QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
826f429a 1438QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
3178ee4c 1439QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
826f429a 1440QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1441
2833fc46 1442/***********************************/
1443/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
1444
1445#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1446static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1447 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1448}
1449#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1450static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1451 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1452}
1453
1454#define QPEL_2TAP(OPNAME, SIZE, MMX)\
1455QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1456QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1457QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1458static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1459 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1460static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1461 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1462static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1463 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1464static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1465 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1466}\
1467static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1468 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1469}\
1470QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1471QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1472QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1473QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1474QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1475QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1476QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1477QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
1478
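/* Editor's note (illustration only): instantiated with OPNAME=put_, SIZE=16,
 * MMX=mmx2, the QPEL_2TAP_L3(..., 10, 0, 1, 0) line above expands to
 *     static void put_2tap_qpel16_mc10_mmx2(uint8_t *dst, uint8_t *src, int stride){
 *         put_2tap_qpel16_l3_mmx2(dst, src+0, stride, 16, 1, 0);
 *     }
 * i.e. each quarter-pel position is approximated by blending the base sample
 * (offset S0) with its neighbours at offsets S1 and S2. */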
1479QPEL_2TAP(put_, 16, mmx2)
1480QPEL_2TAP(avg_, 16, mmx2)
1481QPEL_2TAP(put_, 8, mmx2)
1482QPEL_2TAP(avg_, 8, mmx2)
1483QPEL_2TAP(put_, 16, 3dnow)
1484QPEL_2TAP(avg_, 16, 3dnow)
1485QPEL_2TAP(put_, 8, 3dnow)
1486QPEL_2TAP(avg_, 8, 3dnow)
1487
1488
61a4e8ae 1489#if 0
d6a4c0b1 1490static void just_return(void) { return; }
61a4e8ae 1491#endif
d6a4c0b1 1492
703c8195 1493static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1494 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
1495 const int w = 8;
703c8195 1496 const int ix = ox>>(16+shift);
1497 const int iy = oy>>(16+shift);
1498 const int oxs = ox>>4;
1499 const int oys = oy>>4;
1500 const int dxxs = dxx>>4;
1501 const int dxys = dxy>>4;
1502 const int dyxs = dyx>>4;
1503 const int dyys = dyy>>4;
1504 const uint16_t r4[4] = {r,r,r,r};
1505 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1506 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1507 const uint64_t shift2 = 2*shift;
1508 uint8_t edge_buf[(h+1)*stride];
1509 int x, y;
1510
1511 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1512 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1513 const int dxh = dxy*(h-1);
1514 const int dyw = dyx*(w-1);
1515 if( // non-constant fullpel offset (3% of blocks)
066e0cc5 1516 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1517 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
703c8195 1518 // uses more than 16 bits of subpel mv (only at huge resolution)
1519 || (dxx|dxy|dyx|dyy)&15 )
1520 {
1521 //FIXME could still use mmx for some of the rows
1522 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1523 return;
1524 }
1525
75ca1a5f 1526 src += ix + iy*stride;
703c8195 1527 if( (unsigned)ix >= width-w ||
1528 (unsigned)iy >= height-h )
1529 {
75ca1a5f 1530 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
703c8195 1531 src = edge_buf;
1532 }
75ca1a5f 1533
1534 asm volatile(
1535 "movd %0, %%mm6 \n\t"
1536 "pxor %%mm7, %%mm7 \n\t"
1537 "punpcklwd %%mm6, %%mm6 \n\t"
1538 "punpcklwd %%mm6, %%mm6 \n\t"
1539 :: "r"(1<<shift)
1540 );
703c8195 1541
1542 for(x=0; x<w; x+=4){
1543 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1544 oxs - dxys + dxxs*(x+1),
1545 oxs - dxys + dxxs*(x+2),
1546 oxs - dxys + dxxs*(x+3) };
1547 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1548 oys - dyys + dyxs*(x+1),
1549 oys - dyys + dyxs*(x+2),
1550 oys - dyys + dyxs*(x+3) };
1551
703c8195 1552 for(y=0; y<h; y++){
1553 asm volatile(
1554 "movq %0, %%mm4 \n\t"
1555 "movq %1, %%mm5 \n\t"
1556 "paddw %2, %%mm4 \n\t"
1557 "paddw %3, %%mm5 \n\t"
1558 "movq %%mm4, %0 \n\t"
1559 "movq %%mm5, %1 \n\t"
1560 "psrlw $12, %%mm4 \n\t"
1561 "psrlw $12, %%mm5 \n\t"
1562 : "+m"(*dx4), "+m"(*dy4)
1563 : "m"(*dxy4), "m"(*dyy4)
1564 );
1565
1566 asm volatile(
1567 "movq %%mm6, %%mm2 \n\t"
1568 "movq %%mm6, %%mm1 \n\t"
1569 "psubw %%mm4, %%mm2 \n\t"
1570 "psubw %%mm5, %%mm1 \n\t"
1571 "movq %%mm2, %%mm0 \n\t"
1572 "movq %%mm4, %%mm3 \n\t"
1573 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1574 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1575 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1576 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1577
1578 "movd %4, %%mm5 \n\t"
1579 "movd %3, %%mm4 \n\t"
1580 "punpcklbw %%mm7, %%mm5 \n\t"
1581 "punpcklbw %%mm7, %%mm4 \n\t"
1582 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1583 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1584
1585 "movd %2, %%mm5 \n\t"
1586 "movd %1, %%mm4 \n\t"
1587 "punpcklbw %%mm7, %%mm5 \n\t"
1588 "punpcklbw %%mm7, %%mm4 \n\t"
1589 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1590 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
75ca1a5f 1591 "paddw %5, %%mm1 \n\t"
703c8195 1592 "paddw %%mm3, %%mm2 \n\t"
1593 "paddw %%mm1, %%mm0 \n\t"
1594 "paddw %%mm2, %%mm0 \n\t"
703c8195 1595
1596 "psrlw %6, %%mm0 \n\t"
1597 "packuswb %%mm0, %%mm0 \n\t"
1598 "movd %%mm0, %0 \n\t"
1599
1600 : "=m"(dst[x+y*stride])
1601 : "m"(src[0]), "m"(src[1]),
1602 "m"(src[stride]), "m"(src[stride+1]),
1603 "m"(*r4), "m"(shift2)
1604 );
1605 src += stride;
1606 }
1607 src += 4-h*stride;
1608 }
1609}
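#if 0
/* Editor's sketch (disabled, not part of the build): scalar equivalent of the
 * per-pixel bilinear blend done in the inner loop above, with s = 1<<shift
 * and dx, dy the fractional positions held in mm4/mm5. */
static void gmc_ref_pixel(uint8_t *dst, const uint8_t *src, int stride,
                          int dx, int dy, int s, int r, int shift2)
{
    int v = src[0]        * (s - dx) * (s - dy)
          + src[1]        *      dx  * (s - dy)
          + src[stride]   * (s - dx) *      dy
          + src[stride+1] *      dx  *      dy;
    v = (v + r) >> shift2;                       /* r is the rounder from r4 */
    *dst = v > 255 ? 255 : v < 0 ? 0 : v;        /* packuswb saturation */
}
#endif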
1610
513fbd8e 1611#define PREFETCH(name, op) \
bb54f6ab 1612static void name(void *mem, int stride, int h){\
513fbd8e 1613 const uint8_t *p= mem;\
1614 do{\
1615 asm volatile(#op" %0" :: "m"(*p));\
1616 p+= stride;\
1617 }while(--h);\
1618}
1619PREFETCH(prefetch_mmx2, prefetcht0)
1620PREFETCH(prefetch_3dnow, prefetch)
1621#undef PREFETCH
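/* Editor's note (illustration only): PREFETCH(prefetch_mmx2, prefetcht0)
 * above expands to
 *     static void prefetch_mmx2(void *mem, int stride, int h){
 *         const uint8_t *p= mem;
 *         do{
 *             asm volatile("prefetcht0 %0" :: "m"(*p));
 *             p+= stride;
 *         }while(--h);
 *     }
 * touching one cache line per row ahead of the motion compensation reads. */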
1622
d2bb7db1 1623#include "h264dsp_mmx.c"
115329f1 1624
571bf37f 1625/* CAVS specific */
595e7bd9 1626void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
1627
1628void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1629 put_pixels8_mmx(dst, src, stride, 8);
1630}
1631void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1632 avg_pixels8_mmx(dst, src, stride, 8);
1633}
1634void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1635 put_pixels16_mmx(dst, src, stride, 16);
1636}
1637void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1638 avg_pixels16_mmx(dst, src, stride, 16);
1639}
1640
5b67ce2a 1641/* VC1 specific */
1642void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
1643
1644void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1645 put_pixels8_mmx(dst, src, stride, 8);
1646}
1647
b0368839 1648/* external functions, from idct_mmx.c */
1649void ff_mmx_idct(DCTELEM *block);
1650void ff_mmxext_idct(DCTELEM *block);
1651
1652/* XXX: those functions should be removed ASAP when all IDCTs are
1653 converted */
7c428ea6 1654#ifdef CONFIG_GPL
b0368839 1655static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1656{
1657 ff_mmx_idct (block);
1658 put_pixels_clamped_mmx(block, dest, line_size);
1659}
1660static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1661{
1662 ff_mmx_idct (block);
1663 add_pixels_clamped_mmx(block, dest, line_size);
1664}
1665static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1666{
1667 ff_mmxext_idct (block);
1668 put_pixels_clamped_mmx(block, dest, line_size);
1669}
1670static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1671{
1672 ff_mmxext_idct (block);
1673 add_pixels_clamped_mmx(block, dest, line_size);
1674}
7c428ea6 1675#endif
84740d59 1676static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1677{
1678 ff_idct_xvid_mmx (block);
1679 put_pixels_clamped_mmx(block, dest, line_size);
1680}
1681static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1682{
1683 ff_idct_xvid_mmx (block);
1684 add_pixels_clamped_mmx(block, dest, line_size);
1685}
1686static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1687{
1688 ff_idct_xvid_mmx2 (block);
1689 put_pixels_clamped_mmx(block, dest, line_size);
1690}
1691static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1692{
1693 ff_idct_xvid_mmx2 (block);
1694 add_pixels_clamped_mmx(block, dest, line_size);
1695}
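#if 0
/* Editor's sketch (disabled, not part of the build): the scalar meaning of
 * the put/add halves of the wrapper pairs above, after the respective IDCT
 * has run in place on block. */
static void idct_put_ref(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i, j;
    for(i=0; i<8; i++, dest+=line_size, block+=8)
        for(j=0; j<8; j++)
            dest[j] = av_clip_uint8(block[j]);           /* put: overwrite */
}
static void idct_add_ref(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i, j;
    for(i=0; i<8; i++, dest+=line_size, block+=8)
        for(j=0; j<8; j++)
            dest[j] = av_clip_uint8(dest[j] + block[j]); /* add: accumulate */
}
#endif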
115329f1 1696
cd035a60 1697static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2dac4acf 1698{
1699 int i;
1700 asm volatile("pxor %%mm7, %%mm7":);
1701 for(i=0; i<blocksize; i+=2) {
1702 asm volatile(
1703 "movq %0, %%mm0 \n\t"
1704 "movq %1, %%mm1 \n\t"
1705 "movq %%mm0, %%mm2 \n\t"
1706 "movq %%mm1, %%mm3 \n\t"
1707 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
1708 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
1709 "pslld $31, %%mm2 \n\t" // keep only the sign bit
1710 "pxor %%mm2, %%mm1 \n\t"
1711 "movq %%mm3, %%mm4 \n\t"
1712 "pand %%mm1, %%mm3 \n\t"
1713 "pandn %%mm1, %%mm4 \n\t"
1714 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1715 "pfsub %%mm4, %%mm0 \n\t" // m = m - ((a>0) & (a ^ sign(m)))
1716 "movq %%mm3, %1 \n\t"
1717 "movq %%mm0, %0 \n\t"
1718 :"+m"(mag[i]), "+m"(ang[i])
1719 ::"memory"
1720 );
1721 }
ee5df927 1722 asm volatile("femms");
2dac4acf 1723}
21bb884f 1724static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2dac4acf 1725{
1726 int i;
21bb884f 1727
1728 asm volatile(
1729 "movaps %0, %%xmm5 \n\t"
1730 ::"m"(ff_pdw_80000000[0])
1731 );
2dac4acf 1732 for(i=0; i<blocksize; i+=4) {
1733 asm volatile(
1734 "movaps %0, %%xmm0 \n\t"
1735 "movaps %1, %%xmm1 \n\t"
21bb884f 1736 "xorps %%xmm2, %%xmm2 \n\t"
1737 "xorps %%xmm3, %%xmm3 \n\t"
2dac4acf 1738 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
1739 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
21bb884f 1740 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
1741 "xorps %%xmm2, %%xmm1 \n\t"
2dac4acf 1742 "movaps %%xmm3, %%xmm4 \n\t"
21bb884f 1743 "andps %%xmm1, %%xmm3 \n\t"
1744 "andnps %%xmm1, %%xmm4 \n\t"
2dac4acf 1745 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1746 "subps %%xmm4, %%xmm0 \n\t" // m = m - ((a>0) & (a ^ sign(m)))
1747 "movaps %%xmm3, %1 \n\t"
1748 "movaps %%xmm0, %0 \n\t"
1749 :"+m"(mag[i]), "+m"(ang[i])
1750 ::"memory"
1751 );
1752 }
1753}
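#if 0
/* Editor's sketch (disabled, not part of the build): the scalar
 * magnitude/angle decoupling that both routines above vectorize (cf. the C
 * version in the Vorbis decoder), kept as a readable reference for the
 * branchless sign manipulation. */
static void vorbis_inverse_coupling_ref(float *mag, float *ang, int blocksize)
{
    int i;
    for(i=0; i<blocksize; i++) {
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) { ang[i] = mag[i] - ang[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
        } else {
            if (ang[i] > 0.0) { ang[i] += mag[i]; }
            else              { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
        }
    }
}
#endif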
1754
eb4825b5 1755static void vector_fmul_3dnow(float *dst, const float *src, int len){
06972056 1756 long i = (len-4)*4;
1757 asm volatile(
1758 "1: \n\t"
1759 "movq (%1,%0), %%mm0 \n\t"
1760 "movq 8(%1,%0), %%mm1 \n\t"
1761 "pfmul (%2,%0), %%mm0 \n\t"
1762 "pfmul 8(%2,%0), %%mm1 \n\t"
1763 "movq %%mm0, (%1,%0) \n\t"
1764 "movq %%mm1, 8(%1,%0) \n\t"
1765 "sub $16, %0 \n\t"
1766 "jge 1b \n\t"
1767 "femms \n\t"
1768 :"+r"(i)
1769 :"r"(dst), "r"(src)
1770 :"memory"
1771 );
eb4825b5 1772}
1773static void vector_fmul_sse(float *dst, const float *src, int len){
06972056 1774 long i = (len-8)*4;
1775 asm volatile(
1776 "1: \n\t"
1777 "movaps (%1,%0), %%xmm0 \n\t"
1778 "movaps 16(%1,%0), %%xmm1 \n\t"
1779 "mulps (%2,%0), %%xmm0 \n\t"
1780 "mulps 16(%2,%0), %%xmm1 \n\t"
1781 "movaps %%xmm0, (%1,%0) \n\t"
1782 "movaps %%xmm1, 16(%1,%0) \n\t"
1783 "sub $32, %0 \n\t"
1784 "jge 1b \n\t"
1785 :"+r"(i)
1786 :"r"(dst), "r"(src)
1787 :"memory"
1788 );
eb4825b5 1789}
1790
1791static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
1792 long i = len*4-16;
1793 asm volatile(
1794 "1: \n\t"
1795 "pswapd 8(%1), %%mm0 \n\t"
1796 "pswapd (%1), %%mm1 \n\t"
1797 "pfmul (%3,%0), %%mm0 \n\t"
1798 "pfmul 8(%3,%0), %%mm1 \n\t"
1799 "movq %%mm0, (%2,%0) \n\t"
1800 "movq %%mm1, 8(%2,%0) \n\t"
1801 "add $16, %1 \n\t"
1802 "sub $16, %0 \n\t"
1803 "jge 1b \n\t"
1804 :"+r"(i), "+r"(src1)
1805 :"r"(dst), "r"(src0)
1806 );
1807 asm volatile("femms");
1808}
1f1aa1d9 1809static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
eb4825b5 1810 long i = len*4-32;
1811 asm volatile(
1812 "1: \n\t"
1f1aa1d9 1813 "movaps 16(%1), %%xmm0 \n\t"
1814 "movaps (%1), %%xmm1 \n\t"
1815 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
1816 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
eb4825b5 1817 "mulps (%3,%0), %%xmm0 \n\t"
1818 "mulps 16(%3,%0), %%xmm1 \n\t"
1819 "movaps %%xmm0, (%2,%0) \n\t"
1820 "movaps %%xmm1, 16(%2,%0) \n\t"
1821 "add $32, %1 \n\t"
1822 "sub $32, %0 \n\t"
1823 "jge 1b \n\t"
1824 :"+r"(i), "+r"(src1)
1825 :"r"(dst), "r"(src0)
1826 );
1827}
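/* Editor's note: scalar equivalent of both vector_fmul_reverse versions:
 *     for(i=0; i<len; i++) dst[i] = src0[i] * src1[len-1-i];
 * pswapd (3DNow!ext) reverses element pairs and shufps $0x1b (SSE) reverses
 * whole 4-element vectors of src1 in registers. */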
1828
1829static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
1830 const float *src2, int src3, int len, int step){
06972056 1831 long i = (len-4)*4;
eb4825b5 1832 if(step == 2 && src3 == 0){
eb4825b5 1833 dst += (len-4)*2;
1834 asm volatile(
1835 "1: \n\t"
1836 "movq (%2,%0), %%mm0 \n\t"
1837 "movq 8(%2,%0), %%mm1 \n\t"
1838 "pfmul (%3,%0), %%mm0 \n\t"
1839 "pfmul 8(%3,%0), %%mm1 \n\t"
1840 "pfadd (%4,%0), %%mm0 \n\t"
1841 "pfadd 8(%4,%0), %%mm1 \n\t"
1842 "movd %%mm0, (%1) \n\t"
1843 "movd %%mm1, 16(%1) \n\t"
1844 "psrlq $32, %%mm0 \n\t"
1845 "psrlq $32, %%mm1 \n\t"
1846 "movd %%mm0, 8(%1) \n\t"
1847 "movd %%mm1, 24(%1) \n\t"
1848 "sub $32, %1 \n\t"
1849 "sub $16, %0 \n\t"
1850 "jge 1b \n\t"
1851 :"+r"(i), "+r"(dst)
1852 :"r"(src0), "r"(src1), "r"(src2)
1853 :"memory"
1854 );
1855 }
1856 else if(step == 1 && src3 == 0){
06972056 1857 asm volatile(
1858 "1: \n\t"
1859 "movq (%2,%0), %%mm0 \n\t"
1860 "movq 8(%2,%0), %%mm1 \n\t"
1861 "pfmul (%3,%0), %%mm0 \n\t"
1862 "pfmul 8(%3,%0), %%mm1 \n\t"
1863 "pfadd (%4,%0), %%mm0 \n\t"
1864 "pfadd 8(%4,%0), %%mm1 \n\t"
1865 "movq %%mm0, (%1,%0) \n\t"
1866 "movq %%mm1, 8(%1,%0) \n\t"
1867 "sub $16, %0 \n\t"
1868 "jge 1b \n\t"
1869 :"+r"(i)
1870 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
1871 :"memory"
1872 );
eb4825b5 1873 }
1874 else
1875 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
1876 asm volatile("femms");
1877}
1f1aa1d9 1878static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
06972056 1879 const float *src2, int src3, int len, int step){
1880 long i = (len-8)*4;
eb4825b5 1881 if(step == 2 && src3 == 0){
eb4825b5 1882 dst += (len-8)*2;
1883 asm volatile(
1884 "1: \n\t"
1885 "movaps (%2,%0), %%xmm0 \n\t"
1886 "movaps 16(%2,%0), %%xmm1 \n\t"
1887 "mulps (%3,%0), %%xmm0 \n\t"
1888 "mulps 16(%3,%0), %%xmm1 \n\t"
1889 "addps (%4,%0), %%xmm0 \n\t"
1890 "addps 16(%4,%0), %%xmm1 \n\t"
1f1aa1d9 1891 "movss %%xmm0, (%1) \n\t"
1892 "movss %%xmm1, 32(%1) \n\t"
1893 "movhlps %%xmm0, %%xmm2 \n\t"
1894 "movhlps %%xmm1, %%xmm3 \n\t"
1895 "movss %%xmm2, 16(%1) \n\t"
1896 "movss %%xmm3, 48(%1) \n\t"
1897 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
1898 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
1899 "movss %%xmm0, 8(%1) \n\t"
1900 "movss %%xmm1, 40(%1) \n\t"
1901 "movhlps %%xmm0, %%xmm2 \n\t"
1902 "movhlps %%xmm1, %%xmm3 \n\t"
1903 "movss %%xmm2, 24(%1) \n\t"
1904 "movss %%xmm3, 56(%1) \n\t"
eb4825b5 1905 "sub $64, %1 \n\t"
1906 "sub $32, %0 \n\t"
1907 "jge 1b \n\t"
1908 :"+r"(i), "+r"(dst)
1909 :"r"(src0), "r"(src1), "r"(src2)
1910 :"memory"
1911 );
1912 }
1913 else if(step == 1 && src3 == 0){
06972056 1914 asm volatile(
1915 "1: \n\t"
1916 "movaps (%2,%0), %%xmm0 \n\t"
1917 "movaps 16(%2,%0), %%xmm1 \n\t"
1918 "mulps (%3,%0), %%xmm0 \n\t"
1919 "mulps 16(%3,%0), %%xmm1 \n\t"
1920 "addps (%4,%0), %%xmm0 \n\t"
1921 "addps 16(%4,%0), %%xmm1 \n\t"
1922 "movaps %%xmm0, (%1,%0) \n\t"
1923 "movaps %%xmm1, 16(%1,%0) \n\t"
1924 "sub $32, %0 \n\t"
1925 "jge 1b \n\t"
1926 :"+r"(i)
1927 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
1928 :"memory"
1929 );
eb4825b5 1930 }
1931 else
1932 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
1933}
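/* Editor's note: scalar equivalent of the two fast paths above:
 *     for(i=0; i<len; i++) dst[i*step] = src0[i]*src1[i] + src2[i];
 * only the src3==0 cases are vectorized; anything else falls back to
 * ff_vector_fmul_add_add_c. */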
1934
bb54f6ab 1935static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
eb4825b5 1936 // not bit-exact: pf2id uses different rounding than C and SSE
1937 int i;
1938 for(i=0; i<len; i+=4) {
1939 asm volatile(
1940 "pf2id %1, %%mm0 \n\t"
1941 "pf2id %2, %%mm1 \n\t"
1942 "packssdw %%mm1, %%mm0 \n\t"
1943 "movq %%mm0, %0 \n\t"
1944 :"=m"(dst[i])
1945 :"m"(src[i]), "m"(src[i+2])
1946 );
1947 }
1948 asm volatile("femms");
1949}
bb54f6ab 1950static void float_to_int16_sse(int16_t *dst, const float *src, int len){
eb4825b5 1951 int i;
1952 for(i=0; i<len; i+=4) {
1953 asm volatile(
1954 "cvtps2pi %1, %%mm0 \n\t"
1955 "cvtps2pi %2, %%mm1 \n\t"
1956 "packssdw %%mm1, %%mm0 \n\t"
1957 "movq %%mm0, %0 \n\t"
1958 :"=m"(dst[i])
1959 :"m"(src[i]), "m"(src[i+2])
1960 );
1961 }
1962 asm volatile("emms");
1963}
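/* Editor's note: in scalar terms both converters above do roughly
 *     for(i=0; i<len; i++) dst[i] = av_clip(lrintf(src[i]), -32768, 32767);
 * except that pf2id truncates while cvtps2pi rounds per the MXCSR mode
 * (nearest by default), hence the bit-exactness remark; packssdw supplies
 * the saturation. */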
1964
afa47789 1965extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
1966extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
1967extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
1968extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
9dd6c804 1969extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
e8600e5e 1970 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
9dd6c804 1971extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
e8600e5e 1972 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
4567b4bd 1973
b0368839 1974void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
de6d9b64 1975{
486497e0 1976 mm_flags = mm_support();
63f60826 1977
e5247b5f 1978 if (avctx->dsp_mask) {
bb270c08 1979 if (avctx->dsp_mask & FF_MM_FORCE)
486497e0 1980 mm_flags |= (avctx->dsp_mask & 0xffff);
bb270c08 1981 else
486497e0 1982 mm_flags &= ~(avctx->dsp_mask & 0xffff);
e5247b5f 1983 }
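    /* Editor's note: FF_MM_FORCE turns the listed capability bits on (e.g.
     * dsp_mask = FF_MM_FORCE|MM_MMX pretends MMX is present even if
     * undetected); without it the listed bits are masked out, the usual way
     * to benchmark the C fallbacks on capable hardware. */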
63f60826 1984
1565dabc 1985#if 0
01456e8e 1986 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
486497e0 1987 if (mm_flags & MM_MMX)
01456e8e 1988 av_log(avctx, AV_LOG_INFO, " mmx");
486497e0 1989 if (mm_flags & MM_MMXEXT)
01456e8e 1990 av_log(avctx, AV_LOG_INFO, " mmxext");
486497e0 1991 if (mm_flags & MM_3DNOW)
01456e8e 1992 av_log(avctx, AV_LOG_INFO, " 3dnow");
486497e0 1993 if (mm_flags & MM_SSE)
01456e8e 1994 av_log(avctx, AV_LOG_INFO, " sse");
486497e0 1995 if (mm_flags & MM_SSE2)
01456e8e 1996 av_log(avctx, AV_LOG_INFO, " sse2");
1997 av_log(avctx, AV_LOG_INFO, "\n");
de6d9b64 1998#endif
1999
486497e0 2000 if (mm_flags & MM_MMX) {
b0368839 2001 const int idct_algo= avctx->idct_algo;
2002
178fcca8 2003 if(avctx->lowres==0){
2004 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2005 c->idct_put= ff_simple_idct_put_mmx;
2006 c->idct_add= ff_simple_idct_add_mmx;
2007 c->idct = ff_simple_idct_mmx;
2008 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
7f889a76 2009#ifdef CONFIG_GPL
178fcca8 2010 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
486497e0 2011 if(mm_flags & MM_MMXEXT){
178fcca8 2012 c->idct_put= ff_libmpeg2mmx2_idct_put;
2013 c->idct_add= ff_libmpeg2mmx2_idct_add;
2014 c->idct = ff_mmxext_idct;
2015 }else{
2016 c->idct_put= ff_libmpeg2mmx_idct_put;
2017 c->idct_add= ff_libmpeg2mmx_idct_add;
2018 c->idct = ff_mmx_idct;
2019 }
2020 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
7f889a76 2021#endif
9b5dc867 2022 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
2023 idct_algo==FF_IDCT_VP3 &&
2a2311be 2024 avctx->codec->id!=CODEC_ID_THEORA &&
1dac8fea 2025 !(avctx->flags & CODEC_FLAG_BITEXACT)){
486497e0 2026 if(mm_flags & MM_SSE2){
5773a746 2027 c->idct_put= ff_vp3_idct_put_sse2;
2028 c->idct_add= ff_vp3_idct_add_sse2;
2029 c->idct = ff_vp3_idct_sse2;
2030 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2031 }else{
2032 ff_vp3_dsp_init_mmx();
2033 c->idct_put= ff_vp3_idct_put_mmx;
2034 c->idct_add= ff_vp3_idct_add_mmx;
2035 c->idct = ff_vp3_idct_mmx;
2036 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2037 }
595e7bd9 2038 }else if(idct_algo==FF_IDCT_CAVS){
2039 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
84740d59 2040 }else if(idct_algo==FF_IDCT_XVIDMMX){
486497e0 2041 if(mm_flags & MM_MMXEXT){
84740d59 2042 c->idct_put= ff_idct_xvid_mmx2_put;
2043 c->idct_add= ff_idct_xvid_mmx2_add;
2044 c->idct = ff_idct_xvid_mmx2;
2045 }else{
2046 c->idct_put= ff_idct_xvid_mmx_put;
2047 c->idct_add= ff_idct_xvid_mmx_add;
2048 c->idct = ff_idct_xvid_mmx;
2049 }
cd7af76d 2050 }
b0368839 2051 }
01456e8e 2052
eb4b3dd3 2053 c->put_pixels_clamped = put_pixels_clamped_mmx;
f9ed9d85 2054 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
eb4b3dd3 2055 c->add_pixels_clamped = add_pixels_clamped_mmx;
2056 c->clear_blocks = clear_blocks_mmx;
eb4b3dd3 2057
b2f77586 2058#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2059 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2060 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2061 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2062 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2063
2064 SET_HPEL_FUNCS(put, 0, 16, mmx);
2065 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2066 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2067 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2068 SET_HPEL_FUNCS(put, 1, 8, mmx);
2069 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2070 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2071 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
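    /* Editor's note (illustration only): each SET_HPEL_FUNCS line above
     * expands to four assignments, e.g. SET_HPEL_FUNCS(put, 0, 16, mmx) sets
     * c->put_pixels_tab[0][0..3] to put_pixels16_mmx, _x2_, _y2_ and
     * _xy2_mmx, covering the four half-pel positions (full, x, y, xy). */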
115329f1 2072
703c8195 2073 c->gmc= gmc_mmx;
2074
11f18faf 2075 c->add_bytes= add_bytes_mmx;
4a9ca0a2 2076 c->add_bytes_l2= add_bytes_l2_mmx;
359f98de 2077
eb75a698 2078 if (ENABLE_ANY_H263) {
674eeb5f 2079 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2080 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
eb75a698 2081 }
9fa35729 2082 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
a6624e21 2083 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
9fa35729 2084 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
115329f1 2085
6da971f1 2086 c->h264_idct_dc_add=
2087 c->h264_idct_add= ff_h264_idct_add_mmx;
548a1c8a 2088 c->h264_idct8_dc_add=
2089 c->h264_idct8_add= ff_h264_idct8_add_mmx;
ed5d7a53 2090 if (mm_flags & MM_SSE2)
2091 c->h264_idct8_add= ff_h264_idct8_add_sse2;
6da971f1 2092
486497e0 2093 if (mm_flags & MM_MMXEXT) {
513fbd8e 2094 c->prefetch = prefetch_mmx2;
2095
eb4b3dd3 2096 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2097 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
eb4b3dd3 2098
2099 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2100 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2101 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
eb4b3dd3 2102
2103 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2104 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
eb4b3dd3 2105
2106 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2107 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2108 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
b0368839 2109
ef9d1d15 2110 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2111 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
1d62fc85 2112
b0368839 2113 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2114 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2115 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2116 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2117 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2118 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2119 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2120 }
3178ee4c 2121
b2f77586 2122#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2123 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2124 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2125 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2126 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2127 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2128 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2129 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2130 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2131 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2132 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2133 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2134 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2135 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2136 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2137 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2138 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2139
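    /* Editor's note (illustration only): SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2)
     * below fills all 16 quarter-pel positions at once:
     *     c->put_qpel_pixels_tab[0][ 0] = put_qpel16_mc00_mmx2;
     *     ...
     *     c->put_qpel_pixels_tab[0][15] = put_qpel16_mc33_mmx2;
     * with index x + 4*y for the (x,y) quarter-pel phase. */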
2140 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2141 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2142 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2143 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2144 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2145 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2146
2147 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2148 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2149 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2150 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2151 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2152 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2153
2154 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2155 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2156 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2157 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
437525c4 2158
9fa35729 2159 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
a6624e21 2160 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
fdd30579 2161 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
2162 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
42251a2a 2163 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
2164 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
2165 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
2166 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
5cf08f23 2167 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
2168 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3e20143e 2169 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
42251a2a 2170
b926572a 2171 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
2172 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
2173 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
2174 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
2175 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
2176 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
2177 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
2178 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
2179
2180 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
2181 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
2182 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
2183 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
2184 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
2185 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
2186 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
2187 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
2188
51ac8822 2189 if (ENABLE_CAVS_DECODER)
7c35b551 2190 ff_cavsdsp_init_mmx2(c, avctx);
595e7bd9 2191
51ac8822 2192 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
7c35b551 2193 ff_vc1dsp_init_mmx(c, avctx);
82821c91 2194
4a9ca0a2 2195 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
486497e0 2196 } else if (mm_flags & MM_3DNOW) {
513fbd8e 2197 c->prefetch = prefetch_3dnow;
2198
eb4b3dd3 2199 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2200 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
eb4b3dd3 2201
2202 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2203 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2204 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
eb4b3dd3 2205
2206 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2207 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eb4b3dd3 2208
2209 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2210 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2211 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
b0368839 2212
2213 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2214 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2215 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2216 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2217 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2218 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2219 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2220 }
db794953 2221
b2f77586 2222 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2223 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2224 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2225 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2226 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2227 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2228
2229 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2230 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2231 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2232 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2233 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2234 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2235
2236 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2237 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2238 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2239 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2833fc46 2240
9fa35729 2241 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
a6624e21 2242 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
de6d9b64 2243 }
4567b4bd 2244
1d67b037 2245
2246#define H264_QPEL_FUNCS(x, y, CPU)\
2247 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2248 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2249 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2250 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
ddf96970 2251 if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
2252 // these functions are slower than mmx on AMD, but faster on Intel
1d67b037 2253/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
ddf96970 2254 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2255 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
ddf96970 2256*/
1d67b037 2257 H264_QPEL_FUNCS(0, 0, sse2);
2258 }
2259 if(mm_flags & MM_SSE2){
2260 H264_QPEL_FUNCS(0, 1, sse2);
2261 H264_QPEL_FUNCS(0, 2, sse2);
2262 H264_QPEL_FUNCS(0, 3, sse2);
2263 H264_QPEL_FUNCS(1, 1, sse2);
2264 H264_QPEL_FUNCS(1, 2, sse2);
2265 H264_QPEL_FUNCS(1, 3, sse2);
2266 H264_QPEL_FUNCS(2, 1, sse2);
2267 H264_QPEL_FUNCS(2, 2, sse2);
2268 H264_QPEL_FUNCS(2, 3, sse2);
2269 H264_QPEL_FUNCS(3, 1, sse2);
2270 H264_QPEL_FUNCS(3, 2, sse2);
2271 H264_QPEL_FUNCS(3, 3, sse2);
2272 }
ddf96970 2273#ifdef HAVE_SSSE3
2274 if(mm_flags & MM_SSSE3){
1d67b037 2275 H264_QPEL_FUNCS(1, 0, ssse3);
2276 H264_QPEL_FUNCS(1, 1, ssse3);
2277 H264_QPEL_FUNCS(1, 2, ssse3);
2278 H264_QPEL_FUNCS(1, 3, ssse3);
2279 H264_QPEL_FUNCS(2, 0, ssse3);
2280 H264_QPEL_FUNCS(2, 1, ssse3);
2281 H264_QPEL_FUNCS(2, 2, ssse3);
2282 H264_QPEL_FUNCS(2, 3, ssse3);
2283 H264_QPEL_FUNCS(3, 0, ssse3);
2284 H264_QPEL_FUNCS(3, 1, ssse3);
2285 H264_QPEL_FUNCS(3, 2, ssse3);
2286 H264_QPEL_FUNCS(3, 3, ssse3);
4a9ca0a2 2287 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
ddf96970 2288 }
2289#endif
2290
d42f8802 2291#ifdef CONFIG_SNOW_DECODER
3e0f7126 2292 if(mm_flags & MM_SSE2 & 0){
2c9a0285 2293 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
7bcc1d5b 2294#ifdef HAVE_7REGS
4567b4bd 2295 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
90e9e94d 2296#endif
e8600e5e 2297 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
4567b4bd 2298 }
2299 else{
62975029 2300 if(mm_flags & MM_MMXEXT){
2c9a0285 2301 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
7bcc1d5b 2302#ifdef HAVE_7REGS
4567b4bd 2303 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
90e9e94d 2304#endif
62975029 2305 }
e8600e5e 2306 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
4567b4bd 2307 }
2308#endif
2dac4acf 2309
486497e0 2310 if(mm_flags & MM_3DNOW){
cd035a60 2311 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
eb4825b5 2312 c->vector_fmul = vector_fmul_3dnow;
2313 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
2314 c->float_to_int16 = float_to_int16_3dnow;
2315 }
486497e0 2316 if(mm_flags & MM_3DNOWEXT)
eb4825b5 2317 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
486497e0 2318 if(mm_flags & MM_SSE){
eb4825b5 2319 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2320 c->vector_fmul = vector_fmul_sse;
2321 c->float_to_int16 = float_to_int16_sse;
1f1aa1d9 2322 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2323 c->vector_fmul_add_add = vector_fmul_add_add_sse;
eb4825b5 2324 }
486497e0 2325 if(mm_flags & MM_3DNOW)
06972056 2326 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
de6d9b64 2327 }
115329f1 2328
97d1d009 2329 if (ENABLE_ENCODERS)
2330 dsputilenc_init_mmx(c, avctx);
2331
d6a4c0b1 2332#if 0
2333 // for speed testing
2334 get_pixels = just_return;
2335 put_pixels_clamped = just_return;
2336 add_pixels_clamped = just_return;
2337
2338 pix_abs16x16 = just_return;
2339 pix_abs16x16_x2 = just_return;
2340 pix_abs16x16_y2 = just_return;
2341 pix_abs16x16_xy2 = just_return;
2342
2343 put_pixels_tab[0] = just_return;
2344 put_pixels_tab[1] = just_return;
2345 put_pixels_tab[2] = just_return;
2346 put_pixels_tab[3] = just_return;
2347
2348 put_no_rnd_pixels_tab[0] = just_return;
2349 put_no_rnd_pixels_tab[1] = just_return;
2350 put_no_rnd_pixels_tab[2] = just_return;
2351 put_no_rnd_pixels_tab[3] = just_return;
2352
2353 avg_pixels_tab[0] = just_return;
2354 avg_pixels_tab[1] = just_return;
2355 avg_pixels_tab[2] = just_return;
2356 avg_pixels_tab[3] = just_return;
2357
2358 avg_no_rnd_pixels_tab[0] = just_return;
2359 avg_no_rnd_pixels_tab[1] = just_return;
2360 avg_no_rnd_pixels_tab[2] = just_return;
2361 avg_no_rnd_pixels_tab[3] = just_return;
2362
d6a4c0b1 2363 //av_fdct = just_return;
2364 //ff_idct = just_return;
2365#endif
de6d9b64 2366}