x86: deduplicate some constants
[libav.git] / libavcodec / x86 / vp8dsp.asm
1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 ;*
6 ;* This file is part of Libav.
7 ;*
8 ;* Libav is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* Libav is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with Libav; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "libavutil/x86/x86util.asm"
24
25 SECTION_RODATA
26
; 4-tap filter coefficients stored as 16-bit word pairs for pmaddwd
; (used by the MMXEXT put_vp8_epel4_h4 code). Every row is 16 bytes
; (4 copies of a 2-word pair); a filter is two consecutive rows
; (tap0,tap1 then tap2,tap3) and is fetched at byte offsets mx*16-16 and mx*16.
; NOTE(review): that addressing only lands on a filter's first row for odd mx;
; presumably callers use the 4-tap code for mx in {1,3,5,7} only - confirm.
27 fourtap_filter_hw_m: times 4 dw -6, 123
28 times 4 dw 12, -1
29 times 4 dw -9, 93
30 times 4 dw 50, -6
31 times 4 dw -6, 50
32 times 4 dw 93, -9
33 times 4 dw -1, 12
34 times 4 dw 123, -6
35
; 6-tap filter coefficients stored as 16-bit word pairs for pmaddwd
; (used by the MMXEXT put_vp8_epel4_h6 code). Every row is 16 bytes; a filter
; is three consecutive rows (tap0,tap1 / tap2,tap3 / tap4,tap5) and is fetched
; at byte offsets mx*24-48, mx*24-32 and mx*24-16.
; NOTE(review): that addressing only lands on a filter's first row for even
; mx; presumably callers use the 6-tap code for mx in {2,4,6} only - confirm.
36 sixtap_filter_hw_m: times 4 dw 2, -11
37 times 4 dw 108, 36
38 times 4 dw -8, 1
39 times 4 dw 3, -16
40 times 4 dw 77, 77
41 times 4 dw -16, 3
42 times 4 dw 1, -8
43 times 4 dw 36, 108
44 times 4 dw -11, 2
45
; 4-tap filter coefficients stored as signed byte pairs for pmaddubsw
; (used by the SSSE3 epel h4/v4 code). Every row is 16 bytes (8 copies of a
; byte pair); a filter is two consecutive rows (tap0,tap1 then tap2,tap3),
; fetched at byte offsets m*16-16 and m*16 where m is mx or my.
46 fourtap_filter_hb_m: times 8 db -6, 123
47 times 8 db 12, -1
48 times 8 db -9, 93
49 times 8 db 50, -6
50 times 8 db -6, 50
51 times 8 db 93, -9
52 times 8 db -1, 12
53 times 8 db 123, -6
54
; 6-tap filter coefficients stored as signed byte pairs for pmaddubsw
; (used by the SSSE3 epel h6/v6 code). Every row is 16 bytes; a filter is
; three consecutive rows. Note the pairing differs from the word table: the
; first row holds the two OUTER taps (tap0,tap5), followed by (tap1,tap2) and
; (tap3,tap4). This matches filter_h6_shuf1/2/3 and the row interleaving in
; the v6 code. Rows are fetched at byte offsets m*24-48, m*24-32 and m*24-16.
55 sixtap_filter_hb_m: times 8 db 2, 1
56 times 8 db -11, 108
57 times 8 db 36, -8
58 times 8 db 3, 3
59 times 8 db -16, 77
60 times 8 db 77, -16
61 times 8 db 1, 2
62 times 8 db -8, 36
63 times 8 db 108, -11
64
; 4-tap filter coefficients with each tap broadcast to 8 words (one 16-byte
; row per tap) for pmullw. A filter is four consecutive rows (64 bytes);
; consumers compute the base as table + m*32 - 32 and read rows at +0, +16,
; +32 and +48. Used by the MMXEXT/SSE2 v4 code and the SSE2 epel8_h4 code.
65 fourtap_filter_v_m: times 8 dw -6
66 times 8 dw 123
67 times 8 dw 12
68 times 8 dw -1
69 times 8 dw -9
70 times 8 dw 93
71 times 8 dw 50
72 times 8 dw -6
73 times 8 dw -6
74 times 8 dw 50
75 times 8 dw 93
76 times 8 dw -9
77 times 8 dw -1
78 times 8 dw 12
79 times 8 dw 123
80 times 8 dw -6
81
; 6-tap filter coefficients with each tap broadcast to 8 words (one 16-byte
; row per tap) for pmullw. A filter is six consecutive rows (96 bytes);
; consumers compute the base as table + m*48 - 96 and read rows at +0 .. +80.
; Used by the MMXEXT/SSE2 v6 code and the SSE2 epel8_h6 code.
82 sixtap_filter_v_m: times 8 dw 2
83 times 8 dw -11
84 times 8 dw 108
85 times 8 dw 36
86 times 8 dw -8
87 times 8 dw 1
88 times 8 dw 3
89 times 8 dw -16
90 times 8 dw 77
91 times 8 dw 77
92 times 8 dw -16
93 times 8 dw 3
94 times 8 dw 1
95 times 8 dw -8
96 times 8 dw 36
97 times 8 dw 108
98 times 8 dw -11
99 times 8 dw 2
100
; Bilinear weights 1..7, each broadcast to 8 words (one 16-byte row per
; weight) for pmullw. The MMXEXT/SSE2 bilinear code reads row (m-1) to get
; weight m, and row (7-m) to get the complementary weight 8-m.
101 bilinear_filter_vw_m: times 8 dw 1
102 times 8 dw 2
103 times 8 dw 3
104 times 8 dw 4
105 times 8 dw 5
106 times 8 dw 6
107 times 8 dw 7
108
; Bilinear weights as byte pairs (8-m, m) for m = 1..7, each pair repeated
; 8 times (one 16-byte row per m) for pmaddubsw. The SSSE3 bilinear code
; reads the row at byte offset m*16-16.
109 bilinear_filter_vb_m: times 8 db 7, 1
110 times 8 db 6, 2
111 times 8 db 5, 3
112 times 8 db 4, 4
113 times 8 db 3, 5
114 times 8 db 2, 6
115 times 8 db 1, 7
116
; Table-name indirection for position-independent builds.
; With PIC defined, every un-suffixed table name expands to the picregq
; register, so each function must first do "lea picregq, [<table>_m]" before
; indexing. Because all aliases map to the same register, a function can
; address only one table at a time this way.
; Without PIC, the names are plain aliases for the *_m labels.
; npicregs is the number of extra general-purpose registers that functions
; add to their cglobal register count to hold that base pointer.
117 %ifdef PIC
118 %define fourtap_filter_hw picregq
119 %define sixtap_filter_hw picregq
120 %define fourtap_filter_hb picregq
121 %define sixtap_filter_hb picregq
122 %define fourtap_filter_v picregq
123 %define sixtap_filter_v picregq
124 %define bilinear_filter_vw picregq
125 %define bilinear_filter_vb picregq
126 %define npicregs 1
127 %else
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
136 %define npicregs 0
137 %endif
138
; pshufb masks that gather source bytes into the byte pairs that pmaddubsw
; multiplies against the *_hb coefficient rows.
; filter_h2_shuf: pairs (i, i+1)     - adjacent pixels starting at byte 0
; filter_h4_shuf: pairs (i+2, i+3)   - adjacent pixels starting at byte 2
139 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
140 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
141
; filter_h6_shuf1: pairs (i, i+5)    - the two outer taps of a 6-tap window
; filter_h6_shuf2: pairs (i+1, i+2)
; filter_h6_shuf3: pairs (i+3, i+4)
142 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
143 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
144 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
145
; IDCT multipliers, 4 words each (loaded with movq into m6/m7 by vp8_idct_add).
; 17734 is 35468/2: VP8_MULTIPLY_SUMSUB doubles its input before the pmulhw
; with this constant to obtain the 35468 multiply.
146 pw_20091: times 4 dw 20091
147 pw_17734: times 4 dw 17734
148
; Word constants shared with other files (declared here, defined elsewhere).
149 cextern pw_3
150 cextern pw_4
151 cextern pw_64
152 cextern pw_256
153
154 SECTION .text
155
156 ;-------------------------------------------------------------------------------
157 ; subpel MC functions:
158 ;
159 ; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
160 ; uint8_t *src, int srcstride,
161 ; int height, int mx, int my);
162 ;-------------------------------------------------------------------------------
163
; FILTER_SSSE3 <width>
; Emits the SSSE3 subpel functions put_vp8_epel<width>_{h6,h4,v4,v6}.
; All four use pmaddubsw (unsigned pixel bytes times signed coefficient bytes,
; adjacent products summed) against the *_hb byte-pair tables, then round with
; pmulhrsw against pw_256 and pack to unsigned bytes.
; NOTE(review): pw_256 is external; assuming it holds words of 256, pmulhrsw
; computes (x + 64) >> 7.
; Each loop iteration produces one output row of <width> pixels.
164 %macro FILTER_SSSE3 1
; --- horizontal 6-tap ---
165 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
166 lea mxd, [mxq*3] ; mx*3; scaled by 8 below => 24 bytes per mx step
167 mova m3, [filter_h6_shuf2]
168 mova m4, [filter_h6_shuf3]
169 %ifdef PIC
170 lea picregq, [sixtap_filter_hb_m]
171 %endif
172 mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
173 mova m6, [sixtap_filter_hb+mxq*8-32]
174 mova m7, [sixtap_filter_hb+mxq*8-16]
175
176 .nextrow:
177 movu m0, [srcq-2] ; window starts 2 pixels left of the output position
178 mova m1, m0
179 mova m2, m0
180 %if mmsize == 8
181 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
182 ; shuffle with a memory operand
183 punpcklbw m0, [srcq+3] ; pairs (src[i-2], src[i+3]) = outer taps
184 %else
185 pshufb m0, [filter_h6_shuf1] ; pairs (i, i+5) = outer taps
186 %endif
187 pshufb m1, m3 ; pairs (i+1, i+2)
188 pshufb m2, m4 ; pairs (i+3, i+4)
189 pmaddubsw m0, m5
190 pmaddubsw m1, m6
191 pmaddubsw m2, m7
192 paddsw m0, m1
193 paddsw m0, m2
194 pmulhrsw m0, [pw_256] ; round
195 packuswb m0, m0 ; clip to 0..255
196 movh [dstq], m0 ; store
197
198 ; go to next line
199 add dstq, dststrideq
200 add srcq, srcstrideq
201 dec heightd ; next row
202 jg .nextrow
203 REP_RET
204
; --- horizontal 4-tap ---
205 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
206 shl mxd, 4 ; mx*16: byte offset into 16-byte table rows
207 mova m2, [pw_256]
208 mova m3, [filter_h2_shuf]
209 mova m4, [filter_h4_shuf]
210 %ifdef PIC
211 lea picregq, [fourtap_filter_hb_m]
212 %endif
213 mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
214 mova m6, [fourtap_filter_hb+mxq]
215
216 .nextrow:
217 movu m0, [srcq-1] ; window starts 1 pixel left of the output position
218 mova m1, m0
219 pshufb m0, m3 ; pairs (i, i+1)   -> taps 0/1
220 pshufb m1, m4 ; pairs (i+2, i+3) -> taps 2/3
221 pmaddubsw m0, m5
222 pmaddubsw m1, m6
223 paddsw m0, m1
224 pmulhrsw m0, m2 ; round
225 packuswb m0, m0 ; clip to 0..255
226 movh [dstq], m0 ; store
227
228 ; go to next line
229 add dstq, dststrideq
230 add srcq, srcstrideq
231 dec heightd ; next row
232 jg .nextrow
233 REP_RET
234
; --- vertical 4-tap ---
; The register of the sixth argument is named picreg here: it is not read as
; an argument in the vertical functions and is only written (under PIC) with
; the table base.
235 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
236 shl myd, 4 ; my*16: byte offset into 16-byte table rows
237 %ifdef PIC
238 lea picregq, [fourtap_filter_hb_m]
239 %endif
240 mova m5, [fourtap_filter_hb+myq-16] ; taps 0/1
241 mova m6, [fourtap_filter_hb+myq] ; taps 2/3
242 mova m7, [pw_256]
243
244 ; read 3 lines
245 sub srcq, srcstrideq
246 movh m0, [srcq] ; row -1
247 movh m1, [srcq+ srcstrideq] ; row 0
248 movh m2, [srcq+2*srcstrideq] ; row +1
249 add srcq, srcstrideq ; srcq back at row 0
250
; Loop invariant at .nextrow: m0/m1/m2 hold rows -1/0/+1 relative to the
; output row; the new row +2 is loaded into m3 and the window is rotated.
251 .nextrow:
252 movh m3, [srcq+2*srcstrideq] ; read new row
253 mova m4, m0
254 mova m0, m1
255 punpcklbw m4, m1 ; interleave rows -1/0 for taps 0/1
256 mova m1, m2
257 punpcklbw m2, m3 ; interleave rows +1/+2 for taps 2/3
258 pmaddubsw m4, m5
259 pmaddubsw m2, m6
260 paddsw m4, m2
261 mova m2, m3
262 pmulhrsw m4, m7 ; round
263 packuswb m4, m4 ; clip to 0..255
264 movh [dstq], m4
265
266 ; go to next line
267 add dstq, dststrideq
268 add srcq, srcstrideq
269 dec heightd ; next row
270 jg .nextrow
271 REP_RET
272
; --- vertical 6-tap ---
273 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
274 lea myd, [myq*3] ; my*3; scaled by 8 below => 24 bytes per my step
275 %ifdef PIC
276 lea picregq, [sixtap_filter_hb_m]
277 %endif
278 lea myq, [sixtap_filter_hb+myq*8] ; myq now points just past this filter's three rows
279
280 ; read 5 lines
281 sub srcq, srcstrideq
282 sub srcq, srcstrideq
283 movh m0, [srcq] ; row -2
284 movh m1, [srcq+srcstrideq] ; row -1
285 movh m2, [srcq+srcstrideq*2] ; row 0
286 lea srcq, [srcq+srcstrideq*2]
287 add srcq, srcstrideq ; srcq now at row +1
288 movh m3, [srcq] ; row +1
289 movh m4, [srcq+srcstrideq] ; row +2
290
; Loop invariant at .nextrow: m0..m4 hold rows -2..+2 relative to the output
; row and srcq points at row +1, so [srcq+2*stride] is the new row +3.
291 .nextrow:
292 movh m5, [srcq+2*srcstrideq] ; read new row
293 mova m6, m0
294 punpcklbw m6, m5 ; rows -2/+3: outer taps
295 mova m0, m1
296 punpcklbw m1, m2 ; rows -1/0
297 mova m7, m3
298 punpcklbw m7, m4 ; rows +1/+2
299 pmaddubsw m6, [myq-48]
300 pmaddubsw m1, [myq-32]
301 pmaddubsw m7, [myq-16]
302 paddsw m6, m1
303 paddsw m6, m7
304 mova m1, m2
305 mova m2, m3
306 pmulhrsw m6, [pw_256] ; round
307 mova m3, m4
308 packuswb m6, m6 ; clip to 0..255
309 mova m4, m5
310 movh [dstq], m6
311
312 ; go to next line
313 add dstq, dststrideq
314 add srcq, srcstrideq
315 dec heightd ; next row
316 jg .nextrow
317 REP_RET
318 %endmacro
319
; Instantiate: 4-pixel-wide versions on MMX registers, 8-pixel-wide on XMM.
320 INIT_MMX ssse3
321 FILTER_SSSE3 4
322 INIT_XMM ssse3
323 FILTER_SSSE3 8
324
325 ; 4-pixel-wide block, H-only 4-tap filter (MMXEXT).
; Pixels are widened to words and multiplied with pmaddwd against the word-pair
; table; two output pixels are produced per pmaddwd pair, then the two halves
; are merged, rounded with pw_64, shifted right by 7 and clipped to bytes.
326 INIT_MMX mmxext
327 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
328 shl mxd, 4 ; mx*16: byte offset into 16-byte table rows
329 %ifdef PIC
330 lea picregq, [fourtap_filter_hw_m]
331 %endif
332 movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
333 movq mm5, [fourtap_filter_hw+mxq]
334 movq mm7, [pw_64]
335 pxor mm6, mm6 ; mm6 = 0, used for byte->word unpacking and final pack
336
337 .nextrow:
338 movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
339
340 ; first set of 2 pixels
341 movq mm2, mm1 ; byte ABCD..
342 punpcklbw mm1, mm6 ; byte->word ABCD
343 pshufw mm0, mm2, 9 ; byte CDEF..
344 punpcklbw mm0, mm6 ; byte->word CDEF
345 pshufw mm3, mm1, 0x94 ; word ABBC
346 pshufw mm1, mm0, 0x94 ; word CDDE
347 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
348 movq mm0, mm1 ; backup for second set of pixels
349 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
350 paddd mm3, mm1 ; finish 1st 2px
351
352 ; second set of 2 pixels, use backup of above
353 punpckhbw mm2, mm6 ; byte->word EFGH
354 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
355 pshufw mm1, mm2, 0x94 ; word EFFG
356 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
357 paddd mm0, mm1 ; finish 2nd 2px
358
359 ; merge two sets of 2 pixels into one set of 4, round/clip/store
360 packssdw mm3, mm0 ; merge dword->word (4px)
361 paddsw mm3, mm7 ; rounding
362 psraw mm3, 7
363 packuswb mm3, mm6 ; clip and word->bytes
364 movd [dstq], mm3 ; store
365
366 ; go to next line
367 add dstq, dststrideq
368 add srcq, srcstrideq
369 dec heightd ; next row
370 jg .nextrow
371 REP_RET
372
373 ; 4-pixel-wide block, H-only 6-tap filter (MMXEXT).
; Same scheme as the 4-tap version, with a third coefficient pair (F4/F5).
; mm3 doubles as the zero register and as a temporary, so it is re-zeroed
; inside the loop before it is needed as zero again.
374 INIT_MMX mmxext
375 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
376 lea mxd, [mxq*3] ; mx*3; scaled by 8 below => 24 bytes per mx step
377 %ifdef PIC
378 lea picregq, [sixtap_filter_hw_m]
379 %endif
380 movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
381 movq mm5, [sixtap_filter_hw+mxq*8-32]
382 movq mm6, [sixtap_filter_hw+mxq*8-16]
383 movq mm7, [pw_64]
384 pxor mm3, mm3 ; mm3 = 0 on loop entry
385
386 .nextrow:
387 movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
388
389 ; first set of 2 pixels
390 movq mm2, mm1 ; byte ABCD..
391 punpcklbw mm1, mm3 ; byte->word ABCD
392 pshufw mm0, mm2, 0x9 ; byte CDEF..
393 punpckhbw mm2, mm3 ; byte->word EFGH
394 punpcklbw mm0, mm3 ; byte->word CDEF
395 pshufw mm1, mm1, 0x94 ; word ABBC
396 pshufw mm2, mm2, 0x94 ; word EFFG
397 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
398 pshufw mm3, mm0, 0x94 ; word CDDE
399 movq mm0, mm3 ; backup for second set of pixels
400 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
401 paddd mm1, mm3 ; add to 1st 2px cache
402 movq mm3, mm2 ; backup for second set of pixels
403 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
404 paddd mm1, mm2 ; finish 1st 2px
405
406 ; second set of 2 pixels, use backup of above
407 movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
408 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
409 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
410 paddd mm0, mm3 ; add to 2nd 2px cache
411 pxor mm3, mm3 ; restore mm3 = 0 (also needed at the top of the next iteration)
412 punpcklbw mm2, mm3 ; byte->word FGHI
413 pshufw mm2, mm2, 0xE9 ; word GHHI
414 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
415 paddd mm0, mm2 ; finish 2nd 2px
416
417 ; merge two sets of 2 pixels into one set of 4, round/clip/store
418 packssdw mm1, mm0 ; merge dword->word (4px)
419 paddsw mm1, mm7 ; rounding
420 psraw mm1, 7
421 packuswb mm1, mm3 ; clip and word->bytes
422 movd [dstq], mm1 ; store
423
424 ; go to next line
425 add dstq, dststrideq
426 add srcq, srcstrideq
427 dec heightd ; next row
428 jg .nextrow
429 REP_RET
430
; 8-pixel-wide block, H-only 4-tap filter (SSE2).
; Loads four 8-byte windows at src-1..src+2, widens each to words and
; multiplies it with pmullw by the matching broadcast tap from the *_v table,
; then sums with saturation, adds pw_64, shifts right by 7 and clips to bytes.
431 INIT_XMM sse2
432 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
433 shl mxd, 5 ; mx*32
434 %ifdef PIC
435 lea picregq, [fourtap_filter_v_m]
436 %endif
437 lea mxq, [fourtap_filter_v+mxq-32] ; mxq = address of this filter's first tap row
438 pxor m7, m7 ; m7 = 0 for byte->word unpacking
439 mova m4, [pw_64]
440 mova m5, [mxq+ 0] ; tap 0
441 mova m6, [mxq+16] ; tap 1
; When registers m8 and up exist, taps 2/3 are cached in registers as well;
; otherwise they are read from memory inside the loop.
442 %ifdef m8
443 mova m8, [mxq+32]
444 mova m9, [mxq+48]
445 %endif
446 .nextrow:
447 movq m0, [srcq-1]
448 movq m1, [srcq-0]
449 movq m2, [srcq+1]
450 movq m3, [srcq+2]
451 punpcklbw m0, m7
452 punpcklbw m1, m7
453 punpcklbw m2, m7
454 punpcklbw m3, m7
455 pmullw m0, m5
456 pmullw m1, m6
457 %ifdef m8
458 pmullw m2, m8
459 pmullw m3, m9
460 %else
461 pmullw m2, [mxq+32]
462 pmullw m3, [mxq+48]
463 %endif
464 paddsw m0, m1
465 paddsw m2, m3
466 paddsw m0, m2
467 paddsw m0, m4 ; rounding
468 psraw m0, 7
469 packuswb m0, m7 ; clip and word->bytes
470 movh [dstq], m0 ; store
471
472 ; go to next line
473 add dstq, dststrideq
474 add srcq, srcstrideq
475 dec heightd ; next row
476 jg .nextrow
477 REP_RET
478
; 8-pixel-wide block, H-only 6-tap filter (SSE2).
; Loads six 8-byte windows at src-2..src+3, widens each to words and
; multiplies it with pmullw by the matching broadcast tap from the *_v table,
; then sums with saturation, adds pw_64, shifts right by 7 and clips to bytes.
479 INIT_XMM sse2
480 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
481 lea mxd, [mxq*3]
482 shl mxd, 4 ; mx*48
483 %ifdef PIC
484 lea picregq, [sixtap_filter_v_m]
485 %endif
486 lea mxq, [sixtap_filter_v+mxq-96] ; mxq = address of this filter's first tap row
487 pxor m7, m7 ; m7 = 0 for byte->word unpacking
488 mova m6, [pw_64]
; When registers m8 and up exist, all six taps are cached in registers;
; otherwise they are read from memory inside the loop.
489 %ifdef m8
490 mova m8, [mxq+ 0]
491 mova m9, [mxq+16]
492 mova m10, [mxq+32]
493 mova m11, [mxq+48]
494 mova m12, [mxq+64]
495 mova m13, [mxq+80]
496 %endif
497 .nextrow:
498 movq m0, [srcq-2]
499 movq m1, [srcq-1]
500 movq m2, [srcq-0]
501 movq m3, [srcq+1]
502 movq m4, [srcq+2]
503 movq m5, [srcq+3]
504 punpcklbw m0, m7
505 punpcklbw m1, m7
506 punpcklbw m2, m7
507 punpcklbw m3, m7
508 punpcklbw m4, m7
509 punpcklbw m5, m7
510 %ifdef m8
511 pmullw m0, m8
512 pmullw m1, m9
513 pmullw m2, m10
514 pmullw m3, m11
515 pmullw m4, m12
516 pmullw m5, m13
517 %else
518 pmullw m0, [mxq+ 0]
519 pmullw m1, [mxq+16]
520 pmullw m2, [mxq+32]
521 pmullw m3, [mxq+48]
522 pmullw m4, [mxq+64]
523 pmullw m5, [mxq+80]
524 %endif
; The order of the saturating adds below is deliberate-looking (taps 1+4 and
; 0+5 first); do not reorder without checking intermediate saturation.
525 paddsw m1, m4
526 paddsw m0, m5
527 paddsw m1, m2
528 paddsw m0, m3
529 paddsw m0, m1
530 paddsw m0, m6 ; rounding
531 psraw m0, 7
532 packuswb m0, m7 ; clip and word->bytes
533 movh [dstq], m0 ; store
534
535 ; go to next line
536 add dstq, dststrideq
537 add srcq, srcstrideq
538 dec heightd ; next row
539 jg .nextrow
540 REP_RET
541
; FILTER_V <width>
; Emits put_vp8_epel<width>_v4 and put_vp8_epel<width>_v6 for the currently
; selected instruction set (instantiated below for MMXEXT width 4 and SSE2
; width 8). Rows are kept widened to words in registers and multiplied with
; pmullw by broadcast taps from the *_v tables; the result is rounded with
; pw_64, shifted right by 7 and clipped to bytes.
542 %macro FILTER_V 1
543 ; V-only 4-tap filter for a %1-pixel-wide block
; The register of the sixth argument is named picreg: it is not read as an
; argument here and is only written (under PIC) with the table base.
544 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
545 shl myd, 5 ; my*32
546 %ifdef PIC
547 lea picregq, [fourtap_filter_v_m]
548 %endif
549 lea myq, [fourtap_filter_v+myq-32] ; myq = address of this filter's first tap row
550 mova m6, [pw_64]
551 pxor m7, m7 ; m7 = 0 for byte->word unpacking
552 mova m5, [myq+48] ; tap 3 kept in a register
553
554 ; read 3 lines
555 sub srcq, srcstrideq
556 movh m0, [srcq] ; row -1
557 movh m1, [srcq+ srcstrideq] ; row 0
558 movh m2, [srcq+2*srcstrideq] ; row +1
559 add srcq, srcstrideq ; srcq back at row 0
560 punpcklbw m0, m7
561 punpcklbw m1, m7
562 punpcklbw m2, m7
563
; Loop invariant at .nextrow: m0/m1/m2 hold rows -1/0/+1 (as words) relative
; to the output row.
564 .nextrow:
565 ; first calculate negative taps (to prevent losing positive overflows)
566 movh m4, [srcq+2*srcstrideq] ; read new row
567 punpcklbw m4, m7
568 mova m3, m4 ; keep an unmultiplied copy of the new row
569 pmullw m0, [myq+0]
570 pmullw m4, m5
571 paddsw m4, m0
572
573 ; then calculate positive taps
574 mova m0, m1
575 pmullw m1, [myq+16]
576 paddsw m4, m1
577 mova m1, m2
578 pmullw m2, [myq+32]
579 paddsw m4, m2
580 mova m2, m3
581
582 ; round/clip/store
583 paddsw m4, m6
584 psraw m4, 7
585 packuswb m4, m7
586 movh [dstq], m4
587
588 ; go to next line
589 add dstq, dststrideq
590 add srcq, srcstrideq
591 dec heightd ; next row
592 jg .nextrow
593 REP_RET
594
595
596 ; V-only 6-tap filter for a %1-pixel-wide block
597 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
598 shl myd, 4
599 lea myq, [myq*3] ; my*48
600 %ifdef PIC
601 lea picregq, [sixtap_filter_v_m]
602 %endif
603 lea myq, [sixtap_filter_v+myq-96] ; myq = address of this filter's first tap row
604 pxor m7, m7 ; m7 = 0 for byte->word unpacking
605
606 ; read 5 lines
607 sub srcq, srcstrideq
608 sub srcq, srcstrideq
609 movh m0, [srcq] ; row -2
610 movh m1, [srcq+srcstrideq] ; row -1
611 movh m2, [srcq+srcstrideq*2] ; row 0
612 lea srcq, [srcq+srcstrideq*2]
613 add srcq, srcstrideq ; srcq now at row +1
614 movh m3, [srcq] ; row +1
615 movh m4, [srcq+srcstrideq] ; row +2
616 punpcklbw m0, m7
617 punpcklbw m1, m7
618 punpcklbw m2, m7
619 punpcklbw m3, m7
620 punpcklbw m4, m7
621
; Loop invariant at .nextrow: m0..m4 hold rows -2..+2 (as words) relative to
; the output row and srcq points at row +1.
622 .nextrow:
623 ; first calculate negative taps (to prevent losing positive overflows)
624 mova m5, m1
625 pmullw m5, [myq+16]
626 mova m6, m4
627 pmullw m6, [myq+64]
628 paddsw m6, m5
629
630 ; then calculate positive taps
631 movh m5, [srcq+2*srcstrideq] ; read new row
632 punpcklbw m5, m7
633 pmullw m0, [myq+0]
634 paddsw m6, m0
635 mova m0, m1
636 mova m1, m2
637 pmullw m2, [myq+32]
638 paddsw m6, m2
639 mova m2, m3
640 pmullw m3, [myq+48]
641 paddsw m6, m3
642 mova m3, m4
643 mova m4, m5
644 pmullw m5, [myq+80]
645 paddsw m6, m5
646
647 ; round/clip/store
648 paddsw m6, [pw_64]
649 psraw m6, 7
650 packuswb m6, m7
651 movh [dstq], m6
652
653 ; go to next line
654 add dstq, dststrideq
655 add srcq, srcstrideq
656 dec heightd ; next row
657 jg .nextrow
658 REP_RET
659 %endmacro
660
; Instantiate: 4-pixel-wide versions on MMX registers, 8-pixel-wide on XMM.
661 INIT_MMX mmxext
662 FILTER_V 4
663 INIT_XMM sse2
664 FILTER_V 8
665
; FILTER_BILINEAR <width>
; Emits put_vp8_bilinear<width>_v and put_vp8_bilinear<width>_h for the
; currently selected instruction set (instantiated below for MMXEXT width 4
; and SSE2 width 8). Each output pixel is a*(8-m) + b*m, followed by
; "psraw 2" and "pavgw with zero", which together equal (sum + 4) >> 3.
; Every loop iteration produces TWO output rows, so height is decremented by 2.
666 %macro FILTER_BILINEAR 1
; Vertical: a and b are vertically adjacent pixels.
667 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
668 shl myd, 4 ; my*16: byte offset into 16-byte table rows
669 %ifdef PIC
670 lea picregq, [bilinear_filter_vw_m]
671 %endif
672 pxor m6, m6 ; m6 = 0 for unpacking and for pavgw rounding
673 mova m5, [bilinear_filter_vw+myq-1*16] ; m5 = weight my
674 neg myq
675 mova m4, [bilinear_filter_vw+myq+7*16] ; m4 = weight 8-my
676 .nextrow:
677 movh m0, [srcq+srcstrideq*0]
678 movh m1, [srcq+srcstrideq*1]
679 movh m3, [srcq+srcstrideq*2]
680 punpcklbw m0, m6
681 punpcklbw m1, m6
682 punpcklbw m3, m6
683 mova m2, m1 ; the middle row is the lower input of output row 0 and the upper input of output row 1
684 pmullw m0, m4
685 pmullw m1, m5
686 pmullw m2, m4
687 pmullw m3, m5
688 paddsw m0, m1
689 paddsw m2, m3
690 psraw m0, 2
691 psraw m2, 2
692 pavgw m0, m6 ; (x + 1) >> 1
693 pavgw m2, m6
694 %if mmsize == 8
695 packuswb m0, m0
696 packuswb m2, m2
697 movh [dstq+dststrideq*0], m0
698 movh [dstq+dststrideq*1], m2
699 %else
700 packuswb m0, m2 ; both rows in one register: low half row 0, high half row 1
701 movh [dstq+dststrideq*0], m0
702 movhps [dstq+dststrideq*1], m0
703 %endif
704
705 lea dstq, [dstq+dststrideq*2]
706 lea srcq, [srcq+srcstrideq*2]
707 sub heightd, 2
708 jg .nextrow
709 REP_RET
710
; Horizontal: a and b are horizontally adjacent pixels (src[x], src[x+1]).
711 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
712 shl mxd, 4 ; mx*16: byte offset into 16-byte table rows
713 %ifdef PIC
714 lea picregq, [bilinear_filter_vw_m]
715 %endif
716 pxor m6, m6 ; m6 = 0 for unpacking and for pavgw rounding
717 mova m5, [bilinear_filter_vw+mxq-1*16] ; m5 = weight mx
718 neg mxq
719 mova m4, [bilinear_filter_vw+mxq+7*16] ; m4 = weight 8-mx
720 .nextrow:
721 movh m0, [srcq+srcstrideq*0+0]
722 movh m1, [srcq+srcstrideq*0+1]
723 movh m2, [srcq+srcstrideq*1+0]
724 movh m3, [srcq+srcstrideq*1+1]
725 punpcklbw m0, m6
726 punpcklbw m1, m6
727 punpcklbw m2, m6
728 punpcklbw m3, m6
729 pmullw m0, m4
730 pmullw m1, m5
731 pmullw m2, m4
732 pmullw m3, m5
733 paddsw m0, m1
734 paddsw m2, m3
735 psraw m0, 2
736 psraw m2, 2
737 pavgw m0, m6 ; (x + 1) >> 1
738 pavgw m2, m6
739 %if mmsize == 8
740 packuswb m0, m0
741 packuswb m2, m2
742 movh [dstq+dststrideq*0], m0
743 movh [dstq+dststrideq*1], m2
744 %else
745 packuswb m0, m2 ; both rows in one register: low half row 0, high half row 1
746 movh [dstq+dststrideq*0], m0
747 movhps [dstq+dststrideq*1], m0
748 %endif
749
750 lea dstq, [dstq+dststrideq*2]
751 lea srcq, [srcq+srcstrideq*2]
752 sub heightd, 2
753 jg .nextrow
754 REP_RET
755 %endmacro
756
; Instantiate: 4-pixel-wide versions on MMX registers, 8-pixel-wide on XMM.
757 INIT_MMX mmxext
758 FILTER_BILINEAR 4
759 INIT_XMM sse2
760 FILTER_BILINEAR 8
761
; FILTER_BILINEAR_SSSE3 <width>
; SSSE3 versions of put_vp8_bilinear<width>_v/_h. Pixel pairs (a, b) are
; formed as adjacent bytes and multiplied in one pmaddubsw against the byte
; pair (8-m, m) from bilinear_filter_vb, giving a*(8-m) + b*m directly.
; Rounding is "psraw 2" then "pavgw with zero", i.e. (sum + 4) >> 3.
; Every loop iteration produces TWO output rows.
762 %macro FILTER_BILINEAR_SSSE3 1
; Vertical: pairs come from interleaving two consecutive rows.
763 cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
764 shl myd, 4 ; my*16: byte offset into 16-byte table rows
765 %ifdef PIC
766 lea picregq, [bilinear_filter_vb_m]
767 %endif
768 pxor m4, m4 ; m4 = 0 for pavgw rounding
769 mova m3, [bilinear_filter_vb+myq-16] ; byte pairs (8-my, my)
770 .nextrow:
771 movh m0, [srcq+srcstrideq*0]
772 movh m1, [srcq+srcstrideq*1]
773 movh m2, [srcq+srcstrideq*2]
774 punpcklbw m0, m1 ; rows 0/1 interleaved -> output row 0
775 punpcklbw m1, m2 ; rows 1/2 interleaved -> output row 1
776 pmaddubsw m0, m3
777 pmaddubsw m1, m3
778 psraw m0, 2
779 psraw m1, 2
780 pavgw m0, m4 ; (x + 1) >> 1
781 pavgw m1, m4
782 %if mmsize==8
783 packuswb m0, m0
784 packuswb m1, m1
785 movh [dstq+dststrideq*0], m0
786 movh [dstq+dststrideq*1], m1
787 %else
788 packuswb m0, m1 ; both rows in one register: low half row 0, high half row 1
789 movh [dstq+dststrideq*0], m0
790 movhps [dstq+dststrideq*1], m0
791 %endif
792
793 lea dstq, [dstq+dststrideq*2]
794 lea srcq, [srcq+srcstrideq*2]
795 sub heightd, 2
796 jg .nextrow
797 REP_RET
798
; Horizontal: pairs (src[x], src[x+1]) are gathered with filter_h2_shuf.
799 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
800 shl mxd, 4 ; mx*16: byte offset into 16-byte table rows
801 %ifdef PIC
802 lea picregq, [bilinear_filter_vb_m]
803 %endif
804 pxor m4, m4 ; m4 = 0 for pavgw rounding
805 mova m2, [filter_h2_shuf]
806 mova m3, [bilinear_filter_vb+mxq-16] ; byte pairs (8-mx, mx)
807 .nextrow:
808 movu m0, [srcq+srcstrideq*0]
809 movu m1, [srcq+srcstrideq*1]
810 pshufb m0, m2
811 pshufb m1, m2
812 pmaddubsw m0, m3
813 pmaddubsw m1, m3
814 psraw m0, 2
815 psraw m1, 2
816 pavgw m0, m4 ; (x + 1) >> 1
817 pavgw m1, m4
818 %if mmsize==8
819 packuswb m0, m0
820 packuswb m1, m1
821 movh [dstq+dststrideq*0], m0
822 movh [dstq+dststrideq*1], m1
823 %else
824 packuswb m0, m1 ; both rows in one register: low half row 0, high half row 1
825 movh [dstq+dststrideq*0], m0
826 movhps [dstq+dststrideq*1], m0
827 %endif
828
829 lea dstq, [dstq+dststrideq*2]
830 lea srcq, [srcq+srcstrideq*2]
831 sub heightd, 2
832 jg .nextrow
833 REP_RET
834 %endmacro
835
; Instantiate: 4-pixel-wide versions on MMX registers, 8-pixel-wide on XMM.
836 INIT_MMX ssse3
837 FILTER_BILINEAR_SSSE3 4
838 INIT_XMM ssse3
839 FILTER_BILINEAR_SSSE3 8
840
; put_vp8_pixels8 (MMX): plain copy of an 8-byte-wide block from src to dst,
; two rows per loop iteration (height is decremented by 2 each time).
841 INIT_MMX mmx
842 cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
843 .nextrow:
844 movq mm0, [srcq+srcstrideq*0]
845 movq mm1, [srcq+srcstrideq*1]
846 lea srcq, [srcq+srcstrideq*2]
847 movq [dstq+dststrideq*0], mm0
848 movq [dstq+dststrideq*1], mm1
849 lea dstq, [dstq+dststrideq*2]
850 sub heightd, 2
851 jg .nextrow
852 REP_RET
853
; put_vp8_pixels16 (MMX): plain copy of a 16-byte-wide block using two 8-byte
; moves per row, two rows per loop iteration. Only assembled for 32-bit x86.
854 %if ARCH_X86_32
855 INIT_MMX mmx
856 cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
857 .nextrow:
858 movq mm0, [srcq+srcstrideq*0+0]
859 movq mm1, [srcq+srcstrideq*0+8]
860 movq mm2, [srcq+srcstrideq*1+0]
861 movq mm3, [srcq+srcstrideq*1+8]
862 lea srcq, [srcq+srcstrideq*2]
863 movq [dstq+dststrideq*0+0], mm0
864 movq [dstq+dststrideq*0+8], mm1
865 movq [dstq+dststrideq*1+0], mm2
866 movq [dstq+dststrideq*1+8], mm3
867 lea dstq, [dstq+dststrideq*2]
868 sub heightd, 2
869 jg .nextrow
870 REP_RET
871 %endif
872
; put_vp8_pixels16 (SSE): plain copy of a 16-byte-wide block, two rows per
; loop iteration. Loads use movups (no alignment requirement on src); stores
; use movaps, so dst rows must be 16-byte aligned.
873 INIT_XMM sse
874 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
875 .nextrow:
876 movups xmm0, [srcq+srcstrideq*0]
877 movups xmm1, [srcq+srcstrideq*1]
878 lea srcq, [srcq+srcstrideq*2]
879 movaps [dstq+dststrideq*0], xmm0
880 movaps [dstq+dststrideq*1], xmm1
881 lea dstq, [dstq+dststrideq*2]
882 sub heightd, 2
883 jg .nextrow
884 REP_RET
885
886 ;-----------------------------------------------------------------------------
887 ; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
888 ;-----------------------------------------------------------------------------
889
; ADD_DC pos, neg, offset, mov
; Adds a signed DC value to four rows of pixels using unsigned saturation.
;   %1 = register of bytes to add (callers pass the DC clamped to >= 0)
;   %2 = register of bytes to subtract (callers pass -DC clamped to >= 0)
;   %3 = byte offset added to every row address
;   %4 = move instruction used for the loads and stores (sets the width)
; Rows are dst1q, dst1q+strideq, dst2q, dst2q+strideq.
; Clobbers m2-m5.
890 %macro ADD_DC 4
891 %4 m2, [dst1q+%3]
892 %4 m3, [dst1q+strideq+%3]
893 %4 m4, [dst2q+%3]
894 %4 m5, [dst2q+strideq+%3]
895 paddusb m2, %1
896 paddusb m3, %1
897 paddusb m4, %1
898 paddusb m5, %1
899 psubusb m2, %2
900 psubusb m3, %2
901 psubusb m4, %2
902 psubusb m5, %2
903 %4 [dst1q+%3], m2
904 %4 [dst1q+strideq+%3], m3
905 %4 [dst2q+%3], m4
906 %4 [dst2q+strideq+%3], m5
907 %endmacro
908
; vp8_idct_dc_add (MMX): DC-only inverse transform for one 4x4 block.
; Computes dc = (block[0] + 4) >> 3, clears the coefficient in memory, and
; adds dc to the 4x4 pixel block at dst with saturation.
909 INIT_MMX mmx
910 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
911 ; load data
912 movd m0, [blockq]
913
914 ; calculate DC
915 paddw m0, [pw_4]
916 pxor m1, m1
917 psraw m0, 3
918 movd [blockq], m1 ; clear the first 32 bits of the block
919 psubw m1, m0 ; m1 = -dc
920 packuswb m0, m0 ; dc clamped to 0..255 (0 if dc was negative)
921 packuswb m1, m1 ; -dc clamped to 0..255 (0 if dc was positive)
922 punpcklbw m0, m0 ; replicate the low byte ...
923 punpcklbw m1, m1
924 punpcklwd m0, m0 ; ... into the low four bytes
925 punpcklwd m1, m1
926
927 ; add DC
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
928 DEFINE_ARGS dst1, dst2, stride
929 lea dst2q, [dst1q+strideq*2]
930 ADD_DC m0, m1, 0, movh
931 RET
932
; vp8_idct_dc_add (SSE4): DC-only inverse transform for one 4x4 block.
; Gathers the four 4-byte rows into one register, widens them to words,
; adds the broadcast dc = (block[0] + 4) >> 3 as words, packs back to bytes
; with unsigned saturation and scatters the rows with movd/pextrd.
933 INIT_XMM sse4
934 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
935 ; load data
936 movd m0, [blockq]
937 pxor m1, m1
938
939 ; calculate DC
940 paddw m0, [pw_4]
941 movd [blockq], m1 ; clear the first 32 bits of the block
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
942 DEFINE_ARGS dst1, dst2, stride
943 lea dst2q, [dst1q+strideq*2]
944 movd m2, [dst1q]
945 movd m3, [dst1q+strideq]
946 movd m4, [dst2q]
947 movd m5, [dst2q+strideq]
948 psraw m0, 3
949 pshuflw m0, m0, 0 ; broadcast dc to the low four words
950 punpcklqdq m0, m0 ; ... and to all eight words
951 punpckldq m2, m3 ; rows 0 and 1
952 punpckldq m4, m5 ; rows 2 and 3
953 punpcklbw m2, m1 ; bytes -> words
954 punpcklbw m4, m1
955 paddw m2, m0
956 paddw m4, m0
957 packuswb m2, m4 ; clip; dwords 0..3 = rows 0..3
958 movd [dst1q], m2
959 pextrd [dst1q+strideq], m2, 1
960 pextrd [dst2q], m2, 2
961 pextrd [dst2q+strideq], m2, 3
962 RET
963
964 ;-----------------------------------------------------------------------------
965 ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
966 ;-----------------------------------------------------------------------------
967
; vp8_idct_dc_add4y (MMX, 32-bit x86 only): DC-only inverse transform for four
; horizontally adjacent 4x4 blocks (16 pixels wide, 4 rows).
; The four DC coefficients are read 32 bytes apart (one 16-coefficient block
; each), rounded as (dc + 4) >> 3, cleared in memory, and added with ADD_DC:
; blocks A/B cover bytes 0..7 of each row, blocks C/D cover bytes 8..15.
968 %if ARCH_X86_32
969 INIT_MMX mmx
970 cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
971 ; load data
972 movd m0, [blockq+32*0] ; A
973 movd m1, [blockq+32*2] ; C
974 punpcklwd m0, [blockq+32*1] ; A B
975 punpcklwd m1, [blockq+32*3] ; C D
976 punpckldq m0, m1 ; A B C D
977 pxor m6, m6
978
979 ; calculate DC
980 paddw m0, [pw_4]
981 movd [blockq+32*0], m6 ; clear the DC coefficients
982 movd [blockq+32*1], m6
983 movd [blockq+32*2], m6
984 movd [blockq+32*3], m6
985 psraw m0, 3
986 psubw m6, m0 ; m6 = negated DCs
987 packuswb m0, m0 ; positive parts, clamped to bytes
988 packuswb m6, m6 ; negative parts (as magnitudes), clamped to bytes
989 punpcklbw m0, m0 ; AABBCCDD
990 punpcklbw m6, m6 ; AABBCCDD
991 movq m1, m0
992 movq m7, m6
993 punpcklbw m0, m0 ; AAAABBBB
994 punpckhbw m1, m1 ; CCCCDDDD
995 punpcklbw m6, m6 ; AAAABBBB
996 punpckhbw m7, m7 ; CCCCDDDD
997
998 ; add DC
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
999 DEFINE_ARGS dst1, dst2, stride
1000 lea dst2q, [dst1q+strideq*2]
1001 ADD_DC m0, m6, 0, mova
1002 ADD_DC m1, m7, 8, mova
1003 RET
1004 %endif
1005
; vp8_idct_dc_add4y (SSE2): DC-only inverse transform for four horizontally
; adjacent 4x4 blocks (16 pixels wide, 4 rows).
; Same computation as the MMX version, but each DC byte is replicated four
; times into a single 16-byte register so one ADD_DC with mova covers the
; full 16-pixel width (mova requires 16-byte aligned dst rows).
1006 INIT_XMM sse2
1007 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
1008 ; load data
1009 movd m0, [blockq+32*0] ; A
1010 movd m1, [blockq+32*2] ; C
1011 punpcklwd m0, [blockq+32*1] ; A B
1012 punpcklwd m1, [blockq+32*3] ; C D
1013 punpckldq m0, m1 ; A B C D
1014 pxor m1, m1
1015
1016 ; calculate DC
1017 paddw m0, [pw_4]
1018 movd [blockq+32*0], m1 ; clear the DC coefficients
1019 movd [blockq+32*1], m1
1020 movd [blockq+32*2], m1
1021 movd [blockq+32*3], m1
1022 psraw m0, 3
1023 psubw m1, m0 ; m1 = negated DCs
1024 packuswb m0, m0 ; positive parts, clamped to bytes
1025 packuswb m1, m1 ; negative parts (as magnitudes), clamped to bytes
1026 punpcklbw m0, m0 ; each DC byte x2
1027 punpcklbw m1, m1
1028 punpcklbw m0, m0 ; each DC byte x4: AAAABBBBCCCCDDDD
1029 punpcklbw m1, m1
1030
1031 ; add DC
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
1032 DEFINE_ARGS dst1, dst2, stride
1033 lea dst2q, [dst1q+strideq*2]
1034 ADD_DC m0, m1, 0, mova
1035 RET
1036
1037 ;-----------------------------------------------------------------------------
1038 ; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
1039 ;-----------------------------------------------------------------------------
1040
; vp8_idct_dc_add4uv (MMX): DC-only inverse transform for four 4x4 blocks
; arranged 2x2 (8 pixels wide, 8 rows).
; Blocks A/B are added to rows 0..3, then both row pointers are advanced by
; four rows and blocks C/D are added to rows 4..7.
1041 INIT_MMX mmx
1042 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
1043 ; load data
1044 movd m0, [blockq+32*0] ; A
1045 movd m1, [blockq+32*2] ; C
1046 punpcklwd m0, [blockq+32*1] ; A B
1047 punpcklwd m1, [blockq+32*3] ; C D
1048 punpckldq m0, m1 ; A B C D
1049 pxor m6, m6
1050
1051 ; calculate DC
1052 paddw m0, [pw_4]
1053 movd [blockq+32*0], m6 ; clear the DC coefficients
1054 movd [blockq+32*1], m6
1055 movd [blockq+32*2], m6
1056 movd [blockq+32*3], m6
1057 psraw m0, 3
1058 psubw m6, m0 ; m6 = negated DCs
1059 packuswb m0, m0 ; positive parts, clamped to bytes
1060 packuswb m6, m6 ; negative parts (as magnitudes), clamped to bytes
1061 punpcklbw m0, m0 ; AABBCCDD
1062 punpcklbw m6, m6 ; AABBCCDD
1063 movq m1, m0
1064 movq m7, m6
1065 punpcklbw m0, m0 ; AAAABBBB
1066 punpckhbw m1, m1 ; CCCCDDDD
1067 punpcklbw m6, m6 ; AAAABBBB
1068 punpckhbw m7, m7 ; CCCCDDDD
1069
1070 ; add DC
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
1071 DEFINE_ARGS dst1, dst2, stride
1072 lea dst2q, [dst1q+strideq*2]
1073 ADD_DC m0, m6, 0, mova
1074 lea dst1q, [dst1q+strideq*4] ; move down to rows 4..7
1075 lea dst2q, [dst2q+strideq*4]
1076 ADD_DC m1, m7, 0, mova
1077 RET
1078
1079 ;-----------------------------------------------------------------------------
1080 ; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
1081 ;-----------------------------------------------------------------------------
1082
1083 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
1084 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
; %3 and %4 are scratch registers and are clobbered.
; mul_20091(x) is computed as x + pmulhw(x, 20091);
; mul_35468(x) is computed as pmulhw(2*x, 17734), since 35468 does not fit in
; a signed 16-bit word.
1085 %macro VP8_MULTIPLY_SUMSUB 4
1086 mova %3, %1
1087 mova %4, %2
1088 pmulhw %3, m6 ;20091(1)
1089 pmulhw %4, m6 ;20091(2)
1090 paddw %3, %1
1091 paddw %4, %2
1092 paddw %1, %1
1093 paddw %2, %2
1094 pmulhw %1, m7 ;35468(1)
1095 pmulhw %2, m7 ;35468(2)
1096 psubw %1, %4
1097 paddw %2, %3
1098 %endmacro
1099
1100 ; calculate x0=%1+%3; x1=%1-%3
1101 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
1102 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
1103 ; %5/%6 are temporary registers
1104 ; we assume m6/m7 have constant words 20091/17734 loaded in them
; Arguments are register NUMBERS (not names); the trailing SWAPs rename the
; registers so the results end up under the numbers listed above.
1105 %macro VP8_IDCT_TRANSFORM4x4_1D 6
1106 SUMSUB_BA w, %3, %1, %5 ;t0, t1
1107 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
1108 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
1109 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
1110 SWAP %4, %1
1111 SWAP %4, %3
1112 %endmacro
1113
; VP8_IDCT_ADD: emits vp8_idct_add for the currently selected instruction set.
; Loads the 4x4 coefficient block, clears it in memory, runs the 1-D transform
; twice with a transpose after each pass (pw_4 is added to the first register
; before the second pass as rounding), then adds the result to the 4x4 pixel
; block at dst.
; NOTE(review): STORE_DIFFx2 is defined in x86util.asm; the ", 3" argument is
; presumably the final right shift - confirm there.
1114 %macro VP8_IDCT_ADD 0
1115 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
1116 ; load block data
1117 movq m0, [blockq+ 0]
1118 movq m1, [blockq+ 8]
1119 movq m2, [blockq+16]
1120 movq m3, [blockq+24]
1121 movq m6, [pw_20091]
1122 movq m7, [pw_17734]
; Clear the coefficients. The SSE path uses xmm0 with two 16-byte aligned
; stores; under INIT_MMX the data registers m0-m3 are MMX registers, so
; zeroing xmm0 does not disturb the loaded coefficients.
1123 %if cpuflag(sse)
1124 xorps xmm0, xmm0
1125 movaps [blockq+ 0], xmm0
1126 movaps [blockq+16], xmm0
1127 %else
1128 pxor m4, m4
1129 movq [blockq+ 0], m4
1130 movq [blockq+ 8], m4
1131 movq [blockq+16], m4
1132 movq [blockq+24], m4
1133 %endif
1134
1135 ; actual IDCT
1136 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1137 TRANSPOSE4x4W 0, 1, 2, 3, 4
1138 paddw m0, [pw_4]
1139 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1140 TRANSPOSE4x4W 0, 1, 2, 3, 4
1141
1142 ; store
1143 pxor m4, m4
; dst is renamed dst1; the block register is reused as dst2 (two rows down).
1144 DEFINE_ARGS dst1, dst2, stride
1145 lea dst2q, [dst1q+2*strideq]
1146 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
1147 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
1148
1149 RET
1150 %endmacro
1151
; The plain MMX version is only built for 32-bit x86; the "sse" version still
; uses MMX registers for the transform and differs only in how it clears the
; block.
1152 %if ARCH_X86_32
1153 INIT_MMX mmx
1154 VP8_IDCT_ADD
1155 %endif
1156 INIT_MMX sse
1157 VP8_IDCT_ADD
1158
1159 ;-----------------------------------------------------------------------------
1160 ; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
1161 ;-----------------------------------------------------------------------------
1162
; SCATTER_WHT a, b, base
; Writes the four words of m<a> and m<b> to the first coefficient of eight
; different 16-coefficient blocks (2*16 bytes per block):
;   word j of m<a> -> block (4*j + base)
;   word j of m<b> -> block (4*j + base + 1)
; Words are extracted two at a time through the dc1/dc2 general-purpose
; registers. Clobbers dc1, dc2 and shifts m<a>/m<b> right by 32 bits.
1163 %macro SCATTER_WHT 3
1164 movd dc1d, m%1
1165 movd dc2d, m%2
1166 mov [blockq+2*16*(0+%3)], dc1w
1167 mov [blockq+2*16*(1+%3)], dc2w
1168 shr dc1d, 16 ; word 1
1169 shr dc2d, 16
1170 psrlq m%1, 32 ; bring words 2/3 down
1171 psrlq m%2, 32
1172 mov [blockq+2*16*(4+%3)], dc1w
1173 mov [blockq+2*16*(5+%3)], dc2w
1174 movd dc1d, m%1
1175 movd dc2d, m%2
1176 mov [blockq+2*16*(8+%3)], dc1w
1177 mov [blockq+2*16*(9+%3)], dc2w
1178 shr dc1d, 16 ; word 3
1179 shr dc2d, 16
1180 mov [blockq+2*16*(12+%3)], dc1w
1181 mov [blockq+2*16*(13+%3)], dc2w
1182 %endmacro
1183
; HADAMARD4_1D a, b, c, d
; One 1-D pass of a 4-point Hadamard transform on word registers m<a>..m<d>,
; built from two sum/difference butterfly stages; the final SWAP renames the
; registers to restore the output order.
; NOTE(review): SUMSUB_BADC is defined in x86util.asm - see there for the
; exact operand roles.
1184 %macro HADAMARD4_1D 4
1185 SUMSUB_BADC w, %2, %1, %4, %3
1186 SUMSUB_BADC w, %4, %2, %3, %1
1187 SWAP %1, %4, %3
1188 %endmacro
1189
; VP8_DC_WHT: emits vp8_luma_dc_wht for the currently selected instruction set.
; Loads the 16 DC values, clears them in memory, applies the 4x4 Hadamard
; transform (two 1-D passes with a transpose in between, pw_3 added to the
; first register before the second pass), shifts right by 3 and scatters the
; 16 results into the first coefficient of each of the 16 blocks.
; The second argument register is named dc1 and a third register dc2 is
; allocated; both are reused as scratch by SCATTER_WHT after the input has
; been loaded.
1190 %macro VP8_DC_WHT 0
1191 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
1192 movq m0, [dc1q]
1193 movq m1, [dc1q+8]
1194 movq m2, [dc1q+16]
1195 movq m3, [dc1q+24]
; Clear the input. The SSE path uses xmm0 with two 16-byte aligned stores;
; under INIT_MMX the data registers m0-m3 are MMX registers, so zeroing xmm0
; does not disturb the loaded values.
1196 %if cpuflag(sse)
1197 xorps xmm0, xmm0
1198 movaps [dc1q+ 0], xmm0
1199 movaps [dc1q+16], xmm0
1200 %else
1201 pxor m4, m4
1202 movq [dc1q+ 0], m4
1203 movq [dc1q+ 8], m4
1204 movq [dc1q+16], m4
1205 movq [dc1q+24], m4
1206 %endif
1207 HADAMARD4_1D 0, 1, 2, 3
1208 TRANSPOSE4x4W 0, 1, 2, 3, 4
1209 paddw m0, [pw_3] ; rounding term for the final >> 3
1210 HADAMARD4_1D 0, 1, 2, 3
1211 psraw m0, 3
1212 psraw m1, 3
1213 psraw m2, 3
1214 psraw m3, 3
1215 SCATTER_WHT 0, 1, 0
1216 SCATTER_WHT 2, 3, 2
1217 RET
1218 %endmacro
1219
; The plain MMX version is only built for 32-bit x86; the "sse" version still
; uses MMX registers for the transform and differs only in how it clears the
; input.
1220 %if ARCH_X86_32
1221 INIT_MMX mmx
1222 VP8_DC_WHT
1223 %endif
1224 INIT_MMX sse
1225 VP8_DC_WHT