x86inc: Drop SECTION_TEXT macro
[libav.git] / libavcodec / x86 / qpeldsp.asm
1 ;******************************************************************************
2 ;* quarterpel DSP functions
3 ;*
4 ;* Copyright (c) 2008 Loren Merritt
5 ;*
6 ;* This file is part of Libav.
7 ;*
8 ;* Libav is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* Libav is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with Libav; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "libavutil/x86/x86util.asm"
24
25 SECTION_RODATA
26 cextern pb_1
27 cextern pw_3
28 cextern pw_15
29 cextern pw_16
30 cextern pw_20
31
32
33 SECTION .text
34
35 ; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Writes rows of 8 bytes to dst; each row is the byte-wise average of one row
; of src1 and one row of src2.
;
; Register roles as used below (matching the prototype's argument order):
;   r0 = dst   (advanced by r3 = dstStride after every stored row)
;   r1 = src1  (advanced by r4 = src1Stride after every loaded row)
;   r2 = src2  (advanced by 8 bytes per row, i.e. rows are read back to back)
;   r5 = h     (remaining row count)
;   m6 = all-ones mask
;
; Every input is XORed with m6 before PAVGB and the result is XORed with m6
; again. PAVGB is defined outside this file; assuming it is the round-up byte
; average, complementing inputs and output turns it into a round-down average
; (the "no_rnd" in the name) -- TODO confirm against the PAVGB definition.
;
; Row-count constraint visible in the code: an odd h is handled by emitting a
; single row first; the main loop then emits four rows per pass and only exits
; when r5d reaches exactly zero, so the count entering .loop has to be a
; non-zero multiple of 4.
36 %macro PUT_NO_RND_PIXELS8_L2 0
37 cglobal put_no_rnd_pixels8_l2, 6,6
; widen the two int strides so they can be added to pointers
38 movsxdifnidn r4, r4d
39 movsxdifnidn r3, r3d
; m6 = 0xFF in every byte
40 pcmpeqb m6, m6
; even h: go straight to the 4-row loop
41 test r5d, 1
42 je .loop
; odd h: produce one row up front
43 mova m0, [r1]
44 mova m1, [r2]
45 add r1, r4
46 add r2, 8
47 pxor m0, m6
48 pxor m1, m6
49 PAVGB m0, m1
50 pxor m0, m6
51 mova [r0], m0
52 add r0, r3
53 dec r5d
54 .loop:
; rows 0 and 1 of this pass: src1 rows via r1, src2 rows at [r2] and [r2+8]
55 mova m0, [r1]
56 add r1, r4
57 mova m1, [r1]
58 add r1, r4
59 mova m2, [r2]
60 mova m3, [r2+8]
61 pxor m0, m6
62 pxor m1, m6
63 pxor m2, m6
64 pxor m3, m6
65 PAVGB m0, m2
66 PAVGB m1, m3
67 pxor m0, m6
68 pxor m1, m6
69 mova [r0], m0
70 add r0, r3
71 mova [r0], m1
72 add r0, r3
; rows 2 and 3 of this pass: src2 rows at [r2+16] and [r2+24]
73 mova m0, [r1]
74 add r1, r4
75 mova m1, [r1]
76 add r1, r4
77 mova m2, [r2+16]
78 mova m3, [r2+24]
79 pxor m0, m6
80 pxor m1, m6
81 pxor m2, m6
82 pxor m3, m6
83 PAVGB m0, m2
84 PAVGB m1, m3
85 pxor m0, m6
86 pxor m1, m6
87 mova [r0], m0
88 add r0, r3
89 mova [r0], m1
90 add r0, r3
; four src2 rows (4 * 8 bytes) consumed, four rows done
91 add r2, 32
92 sub r5d, 4
93 jne .loop
94 REP_RET
95 %endmacro
96
; emit the function once, for the mmxext instruction set
97 INIT_MMX mmxext
98 PUT_NO_RND_PIXELS8_L2
99
100
101 ; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Writes rows of 16 bytes to dst (handled as two 8-byte halves); each row is
; the byte-wise average of one row of src1 and one row of src2.
;
; Register roles as used below (matching the prototype's argument order):
;   r0 = dst   (advanced by r3 = dstStride per row)
;   r1 = src1  (advanced by r4 = src1Stride per row)
;   r2 = src2  (advanced by 16 bytes per row, i.e. rows are read back to back)
;   r5 = h     (remaining row count)
;   m6 = all-ones mask
;
; Inputs and result are XORed with m6 around PAVGB. PAVGB is defined outside
; this file; assuming it is the round-up byte average, this produces the
; round-down average -- TODO confirm against the PAVGB definition.
;
; Row-count constraint visible in the code: an odd h is handled by one row
; before the loop; the loop emits two rows per pass and only exits when r5d
; reaches exactly zero, so the count entering .loop has to be non-zero and even.
102 %macro PUT_NO_RND_PIXELS16_l2 0
103 cglobal put_no_rnd_pixels16_l2, 6,6
; widen the two int strides so they can be added to pointers
104 movsxdifnidn r3, r3d
105 movsxdifnidn r4, r4d
; m6 = 0xFF in every byte
106 pcmpeqb m6, m6
; even h: go straight to the 2-row loop
107 test r5d, 1
108 je .loop
; odd h: produce one 16-byte row up front
109 mova m0, [r1]
110 mova m1, [r1+8]
111 mova m2, [r2]
112 mova m3, [r2+8]
113 pxor m0, m6
114 pxor m1, m6
115 pxor m2, m6
116 pxor m3, m6
117 PAVGB m0, m2
118 PAVGB m1, m3
119 pxor m0, m6
120 pxor m1, m6
121 add r1, r4
122 add r2, 16
123 mova [r0], m0
124 mova [r0+8], m1
125 add r0, r3
126 dec r5d
127 .loop:
; first row of this pass: src2 bytes [r2] .. [r2+15]
128 mova m0, [r1]
129 mova m1, [r1+8]
130 add r1, r4
131 mova m2, [r2]
132 mova m3, [r2+8]
133 pxor m0, m6
134 pxor m1, m6
135 pxor m2, m6
136 pxor m3, m6
137 PAVGB m0, m2
138 PAVGB m1, m3
139 pxor m0, m6
140 pxor m1, m6
141 mova [r0], m0
142 mova [r0+8], m1
143 add r0, r3
; second row of this pass: src2 bytes [r2+16] .. [r2+31]
144 mova m0, [r1]
145 mova m1, [r1+8]
146 add r1, r4
147 mova m2, [r2+16]
148 mova m3, [r2+24]
149 pxor m0, m6
150 pxor m1, m6
151 pxor m2, m6
152 pxor m3, m6
153 PAVGB m0, m2
154 PAVGB m1, m3
155 pxor m0, m6
156 pxor m1, m6
157 mova [r0], m0
158 mova [r0+8], m1
159 add r0, r3
; two src2 rows (2 * 16 bytes) consumed, two rows done
160 add r2, 32
161 sub r5d, 2
162 jne .loop
163 REP_RET
164 %endmacro
165
; emit the function twice: once for mmxext, once for 3dnow
166 INIT_MMX mmxext
167 PUT_NO_RND_PIXELS16_l2
168 INIT_MMX 3dnow
169 PUT_NO_RND_PIXELS16_l2
170
; MPEG4_QPEL16_H_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel16_h_lowpass, a horizontal 8-tap filter producing
; 16 output bytes per row. From the register use below the five arguments are
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count.
;
; With p[] the source bytes of the current row, each output byte is
;   ( 20*(p[x]  +p[x+1]) - 6*(p[x-1]+p[x+2])
;   +  3*(p[x-2]+p[x+3]) -   (p[x-3]+p[x+4]) + PW_ROUND ) >> 5
; saturated to 0..255 by packuswb. The -6 and +3 terms are computed together
; as 3*((p[x-2]+p[x+3]) - 2*(p[x-1]+p[x+2])).
;
; Only source bytes 0..16 of a row are read. Taps that would fall before
; byte 0 or after byte 16 are folded back onto nearby in-range bytes by the
; pshufw patterns at the start and end of the loop body.
;
; The work is done in four groups of four pixels; shifted vectors built for
; one group are reused as the left-hand taps of the next group.
;
; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
; m7 is kept at zero for the byte->word unpacks. The trailing 16 in the
; cglobal line presumably requests 16 bytes of stack; [rsp+8] is used as an
; 8-byte spill slot for the first group's result.
; The loop runs until r4d is exactly zero, so the row count must be >= 1.
171 %macro MPEG4_QPEL16_H_LOWPASS 1
172 cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
173 movsxdifnidn r2, r2d
174 movsxdifnidn r3, r3d
175 pxor m7, m7
176 .loop:
; ---- pixels 0..3 ----
; m0 = p[x], m1 = p[x+4] as words (bytes 0..3 and 4..7)
177 mova m0, [r1]
178 mova m1, m0
179 mova m2, m0
180 punpcklbw m0, m7
181 punpckhbw m1, m7
; m5 = p[x-1], m6 = p[x-2]; indices below 0 are folded back into the row
182 pshufw m5, m0, 0x90
183 pshufw m6, m0, 0x41
; m2 = p[x+3], m3 = p[x+2], m4 = p[x+1] (byte shift, then unpack high half)
184 mova m3, m2
185 mova m4, m2
186 psllq m2, 8
187 psllq m3, 16
188 psllq m4, 24
189 punpckhbw m2, m7
190 punpckhbw m3, m7
191 punpckhbw m4, m7
; m6 = (p[x-2]+p[x+3]) - 2*(p[x-1]+p[x+2]), later scaled by 3
192 paddw m5, m3
193 paddw m6, m2
194 paddw m5, m5
195 psubw m6, m5
; m5 = p[x-3] (folded at the left edge) + p[x+4]
196 pshufw m5, m0, 6
197 pmullw m6, [pw_3]
198 paddw m0, m4
199 paddw m5, m1
200 pmullw m0, [pw_20]
201 psubw m0, m5
202 paddw m6, [PW_ROUND]
203 paddw m0, m6
204 psraw m0, 5
; park the four word results; every MMX register is needed again below
205 mova [rsp+8], m0
; ---- pixels 4..7 ----
; m1..m4 from the first group now serve as p[x], p[x-1], p[x-2], p[x-3]
206 mova m0, [r1+5]
207 mova m5, m0
208 mova m6, m0
; m0 = p[x+2], m5 = p[x+3]
209 psrlq m0, 8
210 psrlq m5, 16
211 punpcklbw m0, m7
212 punpcklbw m5, m7
213 paddw m2, m0
214 paddw m3, m5
215 paddw m2, m2
216 psubw m3, m2
; m2 = p[x+1], m6 = p[x+4]
217 mova m2, m6
218 psrlq m6, 24
219 punpcklbw m2, m7
220 punpcklbw m6, m7
221 pmullw m3, [pw_3]
222 paddw m1, m2
223 paddw m4, m6
224 pmullw m1, [pw_20]
225 psubw m3, m4
226 paddw m1, [PW_ROUND]
227 paddw m3, m1
228 psraw m3, 5
; combine with the spilled pixels 0..3 and emit bytes 0..7 (m4 is scratch)
229 mova m1, [rsp+8]
230 packuswb m1, m3
231 OP_MOV [r0], m1, m4
; ---- pixels 8..11 ----
; m6, m5, m0, m2 from the previous group serve as p[x], p[x-1], p[x-2], p[x-3]
232 mova m1, [r1+9]
233 mova m4, m1
234 mova m3, m1
; m1 = p[x+2], m4 = p[x+3]
235 psrlq m1, 8
236 psrlq m4, 16
237 punpcklbw m1, m7
238 punpcklbw m4, m7
239 paddw m5, m1
240 paddw m0, m4
241 paddw m5, m5
242 psubw m0, m5
; m3 = p[x+4]
243 mova m5, m3
244 psrlq m3, 24
245 pmullw m0, [pw_3]
246 punpcklbw m3, m7
247 paddw m2, m3
248 psubw m0, m2
; m2 = p[x+1] (bytes 9..12), m5 = bytes 13..16 as words
249 mova m2, m5
250 punpcklbw m2, m7
251 punpckhbw m5, m7
252 paddw m6, m2
253 pmullw m6, [pw_20]
254 paddw m0, [PW_ROUND]
255 paddw m0, m6
256 psraw m0, 5
; ---- pixels 12..15 ----
; m3 = p[x] + p[x+1]; the pshufw patterns build p[x+2], p[x+3], p[x+4] from
; bytes 13..16, folding indices above 16 back into that range
257 paddw m3, m5
258 pshufw m6, m5, 0xf9
259 paddw m6, m4
260 pshufw m4, m5, 0xbe
261 pshufw m5, m5, 0x6f
262 paddw m4, m1
263 paddw m5, m2
264 paddw m6, m6
265 psubw m4, m6
266 pmullw m3, [pw_20]
267 pmullw m4, [pw_3]
268 psubw m3, m5
269 paddw m4, [PW_ROUND]
270 paddw m4, m3
271 psraw m4, 5
; emit bytes 8..15 (m4 is scratch)
272 packuswb m0, m4
273 OP_MOV [r0+8], m0, m4
; next row
274 add r1, r3
275 add r0, r2
276 dec r4d
277 jne .loop
278 REP_RET
279 %endmacro
280
; PUT_OP dst, src [, scratch]
; Store variant for OP_MOV: copies %2 to %1 with mova. The optional third
; parameter is accepted only so the call sites match AVG_OP; it is not used.
281 %macro PUT_OP 2-3
282 mova %1, %2
283 %endmacro
284
; AVG_OP dst, src, scratch
; Averaging variant for OP_MOV: loads the current contents of %1 into the
; scratch register %3, byte-averages it into %2 with pavgb, and stores %2
; back to %1. Both %2 and %3 are overwritten. The parameter count is declared
; as 2-3, but the body always references %3, so callers must supply it.
285 %macro AVG_OP 2-3
286 mova %3, %1
287 pavgb %2, %3
288 mova %1, %2
289 %endmacro
290
; Expand MPEG4_QPEL16_H_LOWPASS three times for mmxext. Each expansion picks
; up whatever PW_ROUND (rounding constant added before the >>5) and OP_MOV
; (store macro) are defined as at that point. pw_15 / pw_16 are external
; symbols declared with cextern near the top of the file; presumably packed
; words holding 15 and 16 -- verify against their definition.
291 INIT_MMX mmxext
; put: rounding constant pw_16, plain store
292 %define PW_ROUND pw_16
293 %define OP_MOV PUT_OP
294 MPEG4_QPEL16_H_LOWPASS put
; avg: rounding constant pw_16, result averaged with the existing destination
295 %define PW_ROUND pw_16
296 %define OP_MOV AVG_OP
297 MPEG4_QPEL16_H_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
298 %define PW_ROUND pw_15
299 %define OP_MOV PUT_OP
300 MPEG4_QPEL16_H_LOWPASS put_no_rnd
301
302
303
; MPEG4_QPEL8_H_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel8_h_lowpass, a horizontal 8-tap filter producing
; 8 output bytes per row. From the register use below the five arguments are
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = row count.
;
; With p[] the source bytes of the current row, each output byte is
;   ( 20*(p[x]  +p[x+1]) - 6*(p[x-1]+p[x+2])
;   +  3*(p[x-2]+p[x+3]) -   (p[x-3]+p[x+4]) + PW_ROUND ) >> 5
; saturated to 0..255 by packuswb. Only source bytes 0..8 contribute; taps
; that would fall before byte 0 or after byte 8 are folded back onto nearby
; in-range bytes by the pshufw patterns.
;
; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
; m7 is kept at zero for the byte->word unpacks.
; The loop runs until r4d is exactly zero, so the row count must be >= 1.
; NOTE(review): the cglobal line carries a trailing 8 (presumably a stack-size
; request), but no rsp-relative access appears in this macro -- confirm
; whether the allocation is still needed.
304 %macro MPEG4_QPEL8_H_LOWPASS 1
305 cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
306 movsxdifnidn r2, r2d
307 movsxdifnidn r3, r3d
308 pxor m7, m7
309 .loop:
; ---- pixels 0..3 ----
; m0 = p[x], m1 = p[x+4] as words (bytes 0..3 and 4..7)
310 mova m0, [r1]
311 mova m1, m0
312 mova m2, m0
313 punpcklbw m0, m7
314 punpckhbw m1, m7
; m5 = p[x-1], m6 = p[x-2]; indices below 0 are folded back into the row
315 pshufw m5, m0, 0x90
316 pshufw m6, m0, 0x41
; m2 = p[x+3], m3 = p[x+2], m4 = p[x+1] (byte shift, then unpack high half)
317 mova m3, m2
318 mova m4, m2
319 psllq m2, 8
320 psllq m3, 16
321 psllq m4, 24
322 punpckhbw m2, m7
323 punpckhbw m3, m7
324 punpckhbw m4, m7
; m6 = (p[x-2]+p[x+3]) - 2*(p[x-1]+p[x+2]), later scaled by 3
325 paddw m5, m3
326 paddw m6, m2
327 paddw m5, m5
328 psubw m6, m5
; m5 = p[x-3] (folded at the left edge) + p[x+4]
329 pshufw m5, m0, 0x6
330 pmullw m6, [pw_3]
331 paddw m0, m4
332 paddw m5, m1
333 pmullw m0, [pw_20]
334 psubw m0, m5
335 paddw m6, [PW_ROUND]
336 paddw m0, m6
337 psraw m0, 5
; ---- pixels 4..7 ----
; m1..m4 from the first group now serve as p[x], p[x-1], p[x-2], p[x-3];
; m5 = bytes 5..8 as words = p[x+1]
338 movh m5, [r1+5]
339 punpcklbw m5, m7
; p[x+2], p[x+3], p[x+4] are shuffled out of m5, folding indices above 8
; back into bytes 6..8
340 pshufw m6, m5, 0xf9
341 paddw m1, m5
342 paddw m2, m6
343 pshufw m6, m5, 0xbe
344 pshufw m5, m5, 0x6f
345 paddw m3, m6
346 paddw m4, m5
347 paddw m2, m2
348 psubw m3, m2
349 pmullw m1, [pw_20]
350 pmullw m3, [pw_3]
351 psubw m3, m4
352 paddw m1, [PW_ROUND]
353 paddw m3, m1
354 psraw m3, 5
; pack both groups to bytes and emit the row (m4 is scratch)
355 packuswb m0, m3
356 OP_MOV [r0], m0, m4
; next row
357 add r1, r3
358 add r0, r2
359 dec r4d
360 jne .loop
361 REP_RET
362 %endmacro
363
; Expand MPEG4_QPEL8_H_LOWPASS three times for mmxext. Each expansion picks
; up whatever PW_ROUND (rounding constant added before the >>5) and OP_MOV
; (store macro) are defined as at that point. pw_15 / pw_16 are external
; symbols declared with cextern near the top of the file; presumably packed
; words holding 15 and 16 -- verify against their definition.
364 INIT_MMX mmxext
; put: rounding constant pw_16, plain store
365 %define PW_ROUND pw_16
366 %define OP_MOV PUT_OP
367 MPEG4_QPEL8_H_LOWPASS put
; avg: rounding constant pw_16, result averaged with the existing destination
368 %define PW_ROUND pw_16
369 %define OP_MOV AVG_OP
370 MPEG4_QPEL8_H_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
371 %define PW_ROUND pw_15
372 %define OP_MOV PUT_OP
373 MPEG4_QPEL8_H_LOWPASS put_no_rnd
374
375
376
; QPEL_V_LOW a, b, c, d, dst
;
; One step of the vertical 8-tap filter over four pixels held as words.
; On entry m0..m3 hold four consecutive rows r0, r1, r2, r3. The macro computes
;   ( 20*(m0+m1) - 6*(%3+m2) + 3*(%2+m3) - (%1+%4) + PW_ROUND ) >> 5
; (the -6/+3 part is evaluated as 3*((%2+m3) - 2*(%3+m2))), packs the result
; to bytes with unsigned saturation and hands it to OP_MOV with %5 as the
; destination and m7 as scratch.
;
; Parameters (memory operands at every call site in this file):
;   %1, %2, %3 = the rows paired with %4, m3 and m2 respectively, i.e. the
;                three taps on the far side of m0
;   %4         = the row following m3; it is also loaded into m0
;   %5         = destination operand for OP_MOV
;
; Clobbers m4, m5, m6 (and m7 when OP_MOV uses its scratch operand).
; The closing SWAP comes from x86inc; it is assumed to rotate the register
; names so that on exit m0..m3 name the rows r1, r2, r3 and %4, ready for the
; next invocation -- TODO confirm against the SWAP definition.
; PW_ROUND and OP_MOV must be %define'd before the enclosing macro expands.
377 %macro QPEL_V_LOW 5
; m4 = 20 * (m0 + m1)
378 paddw m0, m1
379 mova m4, [pw_20]
380 pmullw m4, m0
; m0 is free now: load the new row %4, then m4 -= (%1 + %4)
381 mova m0, %4
382 mova m5, %1
383 paddw m5, m0
384 psubw m4, m5
; m5 = 3 * ((%2 + m3) - 2*(%3 + m2))
385 mova m5, %2
386 mova m6, %3
387 paddw m5, m3
388 paddw m6, m2
389 paddw m6, m6
390 psubw m5, m6
391 pmullw m5, [pw_3]
; add rounding constant, combine, scale down and pack to bytes
392 paddw m4, [PW_ROUND]
393 paddw m5, m4
394 psraw m5, 5
395 packuswb m5, m5
396 OP_MOV %5, m5, m7
397 SWAP 0,1,2,3
398 %endmacro
399
; MPEG4_QPEL16_V_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel16_v_lowpass, the vertical 8-tap filter for a
; 16-pixel-wide, 16-row output block. From the register use below:
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
;
; Phase 1 (.looph): 17 source rows of 16 bytes are widened to words and
; written to the stack scratch area as four column strips of four pixels.
; Strip k starts at rsp + k*0x88 and holds one 8-byte entry per source row
; (17 * 8 = 0x88 bytes per strip, 4 * 0x88 = 544 bytes, matching the last
; cglobal argument).
;
; Phase 2 (.loopv): for each of the four strips, sixteen QPEL_V_LOW steps
; produce sixteen 4-pixel output rows. Row offsets passed for taps above
; row 0 or below row 16 point back at in-range rows (e.g. 0x80, 0x78, 0x70
; at the bottom), which is how the block edges are handled.
;
; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
400 %macro MPEG4_QPEL16_V_LOWPASS 1
401 cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
402 movsxdifnidn r2, r2d
403 movsxdifnidn r3, r3d
404
; r4d = source rows to unpack, r5 = write cursor in the scratch area
405 mov r4d, 17
406 mov r5, rsp
407 pxor m7, m7
408 .looph:
; load the 16-byte row as two halves, each twice, so low and high unpacks
; can be done in place
409 mova m0, [r1]
410 mova m1, [r1]
411 mova m2, [r1+8]
412 mova m3, [r1+8]
413 punpcklbw m0, m7
414 punpckhbw m1, m7
415 punpcklbw m2, m7
416 punpckhbw m3, m7
; pixels 0-3, 4-7, 8-11, 12-15 go to strips 0, 1, 2, 3
417 mova [r5], m0
418 mova [r5+0x88], m1
419 mova [r5+0x110], m2
420 mova [r5+0x198], m3
421 add r5, 8
422 add r1, r3
423 dec r4d
424 jne .looph
425
426
427 ; NOTE: r1 is repurposed from here on: r1 = 4 - 14*dstStride.
; Each strip advances r0 by 14 strides (seven "lea r0, [r0+r2*2]"), so adding
; r1 returns r0 to the top row and moves it 4 pixels to the right.
428 mov r4d, 4
429 mov r1, 4
430 neg r2
431 lea r1, [r1+r2*8]
432 lea r1, [r1+r2*4]
433 lea r1, [r1+r2*2]
434 neg r2
435 mov r5, rsp
436 .loopv:
; r4d = strips left, r5 = base of the current strip
437 pxor m7, m7
; prime the sliding window with rows 0..3 of this strip
438 mova m0, [r5+ 0x0]
439 mova m1, [r5+ 0x8]
440 mova m2, [r5+0x10]
441 mova m3, [r5+0x18]
; output rows 0..15, two per destination-pointer step
442 QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
443 QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
444 lea r0, [r0+r2*2]
445 QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
446 QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
447 lea r0, [r0+r2*2]
448 QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
449 QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
450 lea r0, [r0+r2*2]
451 QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
452 QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
453 lea r0, [r0+r2*2]
454 QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
455 QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
456 lea r0, [r0+r2*2]
457 QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
458 QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
459 lea r0, [r0+r2*2]
460 QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
; from here the fourth operand walks back up (0x80, 0x78, 0x70): bottom edge
461 QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
462 lea r0, [r0+r2*2]
463 QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
464 QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
465
; next strip: advance the scratch base, rewind dst to the top and step 4 px
466 add r5, 0x88
467 add r0, r1
468 dec r4d
469 jne .loopv
470 REP_RET
471 %endmacro
472
; PUT_OPH dst, src [, scratch]
; Store variant for OP_MOV using movh instead of mova: copies %2 to %1.
; movh is defined outside this file; presumably it moves half a register
; width -- verify. The optional third parameter is accepted only so the call
; sites match AVG_OPH; it is not used.
473 %macro PUT_OPH 2-3
474 movh %1, %2
475 %endmacro
476
; AVG_OPH dst, src, scratch
; Averaging variant for OP_MOV using movh for the memory accesses: loads the
; current contents of %1 into the scratch register %3, byte-averages it into
; %2 with pavgb, and stores %2 back to %1. Both %2 and %3 are overwritten.
; The parameter count is declared as 2-3, but the body always references %3,
; so callers must supply it.
477 %macro AVG_OPH 2-3
478 movh %3, %1
479 pavgb %2, %3
480 movh %1, %2
481 %endmacro
482
; Expand MPEG4_QPEL16_V_LOWPASS three times for mmxext. Each expansion picks
; up whatever PW_ROUND (rounding constant added before the >>5) and OP_MOV
; (store macro, here the movh-based variants) are defined as at that point.
; pw_15 / pw_16 are external symbols declared with cextern near the top of
; the file; presumably packed words holding 15 and 16 -- verify.
483 INIT_MMX mmxext
; put: rounding constant pw_16, plain store
484 %define PW_ROUND pw_16
485 %define OP_MOV PUT_OPH
486 MPEG4_QPEL16_V_LOWPASS put
; avg: rounding constant pw_16, result averaged with the existing destination
487 %define PW_ROUND pw_16
488 %define OP_MOV AVG_OPH
489 MPEG4_QPEL16_V_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
490 %define PW_ROUND pw_15
491 %define OP_MOV PUT_OPH
492 MPEG4_QPEL16_V_LOWPASS put_no_rnd
493
494
495
; MPEG4_QPEL8_V_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel8_v_lowpass, the vertical 8-tap filter for an
; 8-pixel-wide, 8-row output block. From the register use below:
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
;
; Phase 1 (.looph): 9 source rows of 8 bytes are widened to words and written
; to the stack scratch area as two column strips of four pixels. Strip k
; starts at rsp + k*0x48 and holds one 8-byte entry per source row
; (9 * 8 = 0x48 bytes per strip).
;
; Phase 2 (.loopv): for each of the two strips, eight QPEL_V_LOW steps
; produce eight 4-pixel output rows. Row offsets passed for taps above row 0
; or below row 8 point back at in-range rows (e.g. 0x40, 0x38, 0x30 at the
; bottom), which is how the block edges are handled.
;
; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
; NOTE(review): the code addresses only rsp .. rsp+0x8f (2 * 0x48 = 144
; bytes), while the last cglobal argument is 288 -- confirm whether the
; larger request is intentional.
496 %macro MPEG4_QPEL8_V_LOWPASS 1
497 cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
498 movsxdifnidn r2, r2d
499 movsxdifnidn r3, r3d
500
; r4d = source rows to unpack, r5 = write cursor in the scratch area
501 mov r4d, 9
502 mov r5, rsp
503 pxor m7, m7
504 .looph:
; load the 8-byte row twice so low and high unpacks can be done in place
505 mova m0, [r1]
506 mova m1, [r1]
507 punpcklbw m0, m7
508 punpckhbw m1, m7
; pixels 0-3 go to strip 0, pixels 4-7 to strip 1
509 mova [r5], m0
510 mova [r5+0x48], m1
511 add r5, 8
512 add r1, r3
513 dec r4d
514 jne .looph
515
516
517 ; NOTE: r1 is repurposed from here on: r1 = 4 - 6*dstStride.
; Each strip advances r0 by 6 strides (three "lea r0, [r0+r2*2]"), so adding
; r1 returns r0 to the top row and moves it 4 pixels to the right.
518 mov r4d, 2
519 mov r1, 4
520 neg r2
521 lea r1, [r1+r2*4]
522 lea r1, [r1+r2*2]
523 neg r2
524 mov r5, rsp
525 .loopv:
; r4d = strips left, r5 = base of the current strip
526 pxor m7, m7
; prime the sliding window with rows 0..3 of this strip
527 mova m0, [r5+ 0x0]
528 mova m1, [r5+ 0x8]
529 mova m2, [r5+0x10]
530 mova m3, [r5+0x18]
; output rows 0..7, two per destination-pointer step
531 QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
532 QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
533 lea r0, [r0+r2*2]
534 QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
535 QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
536 lea r0, [r0+r2*2]
537 QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
; from here the fourth operand walks back up (0x40, 0x38, 0x30): bottom edge
538 QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
539 lea r0, [r0+r2*2]
540 QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
541 QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
542
; next strip: advance the scratch base, rewind dst to the top and step 4 px
543 add r5, 0x48
544 add r0, r1
545 dec r4d
546 jne .loopv
547 REP_RET
548 %endmacro
549
; Expand MPEG4_QPEL8_V_LOWPASS three times for mmxext. Each expansion picks
; up whatever PW_ROUND (rounding constant added before the >>5) and OP_MOV
; (store macro, here the movh-based variants) are defined as at that point.
; pw_15 / pw_16 are external symbols declared with cextern near the top of
; the file; presumably packed words holding 15 and 16 -- verify.
550 INIT_MMX mmxext
; put: rounding constant pw_16, plain store
551 %define PW_ROUND pw_16
552 %define OP_MOV PUT_OPH
553 MPEG4_QPEL8_V_LOWPASS put
; avg: rounding constant pw_16, result averaged with the existing destination
554 %define PW_ROUND pw_16
555 %define OP_MOV AVG_OPH
556 MPEG4_QPEL8_V_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
557 %define PW_ROUND pw_15
558 %define OP_MOV PUT_OPH
559 MPEG4_QPEL8_V_LOWPASS put_no_rnd