; x86: rv40dsp: Use PAVGB instruction macro where appropriate
; [libav.git] / libavcodec / x86 / rv40dsp.asm
1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV40 decoder
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
6 ;*
7 ;* This file is part of Libav.
8 ;*
9 ;* Libav is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
13 ;*
14 ;* Libav is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
18 ;*
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with Libav; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
23
24 %include "libavutil/x86/x86util.asm"
25
SECTION_RODATA

align 16
; 1 << 10 == 1024; multiplying with pmulhrsw by this constant performs a
; rounded >> 5, used by the ssse3 weight functions below
pw_1024: times 8 dw 1 << (16 - 6)

; RV40 6-tap subpel filter coefficients as signed bytes, stored as
; (tap, tap) pairs for pmaddubsw.  Three sets, one per subpel position:
; [1 -5 52 20 -5 1], 2 * [1 -5 20 20 -5 1] and [1 -5 20 52 -5 1].
; Every set sums to 64, so all three share the same final >> 6.
sixtap_filter_hb_m: times 8 db 1, -5
                    times 8 db 52, 20
                    ; multiplied by 2 to have the same shift
                    times 8 db 2, -10
                    times 8 db 40, 40
                    ; back to normal
                    times 8 db 1, -5
                    times 8 db 20, 52

; Same coefficient sets as 16-bit words for the pmullw-based (pre-SSSE3)
; filters; each set is four splatted values [c05, c14, c2, c3], where
; c05 multiplies the sum of the outermost pixel pair, c14 the next pair
; in, and c2/c3 the two center pixels individually.
sixtap_filter_v_m:  times 8 dw 1
                    times 8 dw -5
                    times 8 dw 52
                    times 8 dw 20
                    ; multiplied by 2 to have the same shift
                    times 8 dw 2
                    times 8 dw -10
                    times 8 dw 40
                    times 8 dw 40
                    ; back to normal
                    times 8 dw 1
                    times 8 dw -5
                    times 8 dw 20
                    times 8 dw 52

; With PIC the table base is loaded into picregq at run time; otherwise
; the absolute symbol is used directly (see the LOAD macro).
; NOTE(review): sixtap_filter_hw / sixtap_filter_hw_m is never used in
; this file and _hw_m is never defined -- presumably a leftover from an
; earlier word-coefficient horizontal filter; confirm before removing.
%ifdef PIC
%define sixtap_filter_hw picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_v picregq
%define npicregs 1
%else
%define sixtap_filter_hw sixtap_filter_hw_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_v sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks turning one unaligned 16-byte load into the pixel pairs
; the ssse3 horizontal filter feeds to pmaddubsw (shuf3 swaps the pair
; order so the same (1,-5) coefficients serve the right-hand taps)
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

; rounding-bias constants shared with the rest of libavcodec
cextern pw_32
cextern pw_16
cextern pw_512
74
75 SECTION .text
76
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                      uint8_t *src, int srcstride,
;                                      int len, int m);
;-----------------------------------------------------------------------------
; LOAD reg, table
; Turns the mx/my argument in register %1 (a byte offset selecting the
; subpel coefficient set) into an absolute pointer by adding the filter
; table base: picregq when PIC, otherwise the symbol given as %2.
%macro LOAD 2
%if WIN64
    movsxd %1q, %1d                     ; Win64: arg arrives 32-bit; sign-extend
%endif
%ifdef PIC
    add %1q, picregq                    ; caller pre-loaded the table base
%else
    add %1q, %2                         ; non-PIC: add the absolute address
%endif
%endmacro
94
; STORE vreg, scratch, op
; Packs the signed 16-bit results in %1 to unsigned bytes (with
; saturation) and stores half a register's worth of pixels at dstq.
; When %3 == avg, the existing dst pixels are loaded into %2 first and
; averaged in with PAVGB.
%macro STORE 3
%ifidn %3, avg
    movh %2, [dstq]                     ; current dst pixels for averaging
%endif
    packuswb %1, %1                     ; clip words to [0,255] bytes
%ifidn %3, avg
    PAVGB %1, %2                        ; (new + old + 1) >> 1
%endif
    movh [dstq], %1
%endmacro
105
; FILTER_V op (put|avg)
; Emits op_rv40_qpel_v: vertical RV40 6-tap subpel filter using 16-bit
; multiplies (pre-SSSE3 path).
;   args: dst, dststride, src, srcstride, height, my
;   my = byte offset into sixtap_filter_v_m selecting [c05,c14,c2,c3]
; Per output pixel:
;   dst = clip(((p[-2]+p[+3])*c05 + (p[-1]+p[+2])*c14
;               + p[0]*c2 + p[+1]*c3 + 32) >> 6)
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    pxor m7, m7                         ; constant zero for byte->word unpack
    LOAD my, sixtap_filter_v

    ; read 5 lines: prime rows -2..+2 relative to the first output row
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]                     ; row -2
    movh m1, [srcq+srcstrideq]          ; row -1
    movh m2, [srcq+srcstrideq*2]        ; row  0
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]                     ; row +1
    movh m4, [srcq+srcstrideq]          ; row +2
    punpcklbw m0, m7                    ; widen all five rows to words
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

%ifdef m8
    ; x86-64: enough registers to cache the four coefficient vectors
    mova m8, [myq+ 0]
    mova m9, [myq+16]
    mova m10, [myq+32]
    mova m11, [myq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
    ; x86-32: fall back to memory operands
%define COEFF05 [myq+ 0]
%define COEFF14 [myq+16]
%define COEFF2 [myq+32]
%define COEFF3 [myq+48]
%endif
.nextrow:
    ; m0..m4 hold rows -2..+2 as words; fetch row +3, accumulate the
    ; tap products into m6, and slide the row window down by one
    mova m6, m1
    movh m5, [srcq+2*srcstrideq]        ; read new row (+3)
    paddw m6, m4                        ; rows -1 + +2
    punpcklbw m5, m7
    pmullw m6, COEFF14
    paddw m0, m5                        ; rows -2 + +3
    pmullw m0, COEFF05
    paddw m6, m0
    mova m0, m1                         ; slide: next iteration's row -2
    paddw m6, [pw_32]                   ; rounding bias for the >> 6
    mova m1, m2
    pmullw m2, COEFF2                   ; center row
    paddw m6, m2
    mova m2, m3
    pmullw m3, COEFF3                   ; row +1
    paddw m6, m3

    ; round/clip/store
    mova m3, m4
    psraw m6, 6
    mova m4, m5
    STORE m6, m5, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                         ; next row
    jg .nextrow
    REP_RET
%endmacro
176
; FILTER_H op (put|avg)
; Emits op_rv40_qpel_h: horizontal RV40 6-tap subpel filter using
; 16-bit multiplies (pre-SSSE3 path).  Shares the word coefficient
; table with FILTER_V; mx selects the coefficient set.
; Per output pixel:
;   dst = clip(((s[-2]+s[+3])*c05 + (s[-1]+s[+2])*c14
;               + s[0]*c2 + s[+1]*c3 + 32) >> 6)
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    pxor m7, m7                         ; zero for byte->word unpack
    LOAD mx, sixtap_filter_v
    mova m6, [pw_32]                    ; loop-invariant rounding bias
%ifdef m8
    ; x86-64: cache the coefficient vectors in registers
    mova m8, [mxq+ 0]
    mova m9, [mxq+16]
    mova m10, [mxq+32]
    mova m11, [mxq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
    ; x86-32: fall back to memory operands
%define COEFF05 [mxq+ 0]
%define COEFF14 [mxq+16]
%define COEFF2 [mxq+32]
%define COEFF3 [mxq+48]
%endif
.nextrow:
    movq m0, [srcq-2]                   ; outermost tap pair...
    movq m5, [srcq+3]
    movq m1, [srcq-1]                   ; ...inner tap pair...
    movq m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq m2, [srcq-0]                   ; ...and the two center pixels
    movq m3, [srcq+1]
    paddw m0, m5                        ; s[-2] + s[+3]
    paddw m1, m4                        ; s[-1] + s[+2]
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw m0, COEFF05
    pmullw m1, COEFF14
    pmullw m2, COEFF2
    pmullw m3, COEFF3
    paddw m0, m6                        ; + 32 rounding bias
    paddw m1, m2
    paddw m0, m3
    paddw m0, m1
    psraw m0, 6
    STORE m0, m1, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                         ; next row
    jg .nextrow
    REP_RET
%endmacro
233
; Instantiate the 16-bit-multiply filters.  MMX flavours are only built
; on x86-32 (x86-64 always has SSE2); the 'avg' variants need PAVGB and
; therefore mmxext/3dnow rather than plain mmx.
%if ARCH_X86_32
INIT_MMX mmx
FILTER_V put
FILTER_H put

INIT_MMX mmxext
FILTER_V avg
FILTER_H avg

INIT_MMX 3dnow
FILTER_V avg
FILTER_H avg
%endif

INIT_XMM sse2
FILTER_H put
FILTER_H avg
FILTER_V put
FILTER_V avg
253
; FILTER_SSSE3 op (put|avg)
; SSSE3 versions of the vertical and horizontal 6-tap filters.  Pixel
; pairs are multiplied-and-summed with pmaddubsw against the byte
; coefficient table, and pmulhrsw with pw_512 performs the final
; rounded shift: (x * 512 + 16384) >> 15 == (x + 32) >> 6.
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines: rows -2..+2 relative to the first output row
    sub srcq, srcstrideq
    LOAD my, sixtap_filter_hb
    sub srcq, srcstrideq
    movh m0, [srcq]                     ; row -2
    movh m1, [srcq+srcstrideq]          ; row -1
    movh m2, [srcq+srcstrideq*2]        ; row  0
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    mova m5, [myq]                      ; outer-tap byte pairs (kept live)
    movh m3, [srcq]                     ; row +1
    movh m4, [srcq+srcstrideq]          ; row +2
    lea srcq, [srcq+2*srcstrideq]       ; srcq now points at row +3

.nextrow:
    ; interleave row pairs so one pmaddubsw applies one coefficient pair
    mova m6, m2
    punpcklbw m0, m1                    ; rows (-2,-1): outer taps
    punpcklbw m6, m3                    ; rows (0,+1): center taps
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh m7, [srcq]                     ; read new row (+3)
    paddw m6, m0
    mova m0, m1                         ; slide the 5-row window down
    mova m1, m2
    mova m2, m3
    mova m3, m4
    mova m4, m7
    punpcklbw m7, m3                    ; rows (+3,+2): outer taps again
    pmaddubsw m7, m5
    paddw m6, m7
    pmulhrsw m6, [pw_512]               ; (sum + 32) >> 6 with rounding
    STORE m6, m7, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                         ; next row
    jg .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    mova m3, [filter_h6_shuf2]
    mova m4, [filter_h6_shuf3]
    LOAD mx, sixtap_filter_hb
    mova m5, [mxq]                      ; set up 6tap filter in bytes
    mova m6, [mxq+16]
    mova m7, [filter_h6_shuf1]

.nextrow:
    ; one unaligned 16-byte load covers every tap of all 8 output
    ; pixels; pshufb rearranges it into the three pairings pmaddubsw
    ; needs (shuf3 reverses its pairs to reuse the shuf1 coefficients)
    movu m0, [srcq-2]
    mova m1, m0
    mova m2, m0
    pshufb m0, m7                       ; (s[-2],s[-1]) pairs
    pshufb m1, m3                       ; (s[0],s[+1]) pairs
    pshufb m2, m4                       ; (s[+3],s[+2]) pairs, reversed
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw m0, m1
    paddw m0, m2
    pmulhrsw m0, [pw_512]               ; (sum + 32) >> 6 with rounding
    STORE m0, m1, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                         ; next row
    jg .nextrow
    REP_RET
%endmacro
333
; SSSE3 (pmaddubsw/pmulhrsw-based) subpel filter instantiations
INIT_XMM ssse3
FILTER_SSSE3 put
FILTER_SSSE3 avg
337
; RV40_WCORE rnd, dst, src1, src2[, stride]
;   %1 = 1 for the 5-bit-weight path (nornd variant), 0 for 14-bit (rnd)
;   %2 = dst, %3 = src1, %4 = src2 (note: the original comment said
;        "src3", but MAIN_LOOP passes r2 = src2 here)
;   %5 = stride, passed only for sse2 on 8-pixel-wide blocks, where one
;        xmm register holds two consecutive rows
; Weighted-average core: dst = (src1*w2 + src2*w1 + bias) >> shift for
; mmsize pixels at offset r6.  Expects m0 = zero, m1 = bias constant,
; m2/m3 = splatted weights as set up by RV40_WEIGHT.
%macro RV40_WCORE 4-5
    movh m4, [%3 + r6 + 0]
    movh m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and sse2, stride was provided: second half is next row
%define OFFSET r6
    add r6, r5
%endif
    movh m6, [%3 + OFFSET]
    movh m7, [%4 + OFFSET]

%if %1 == 0
    ; 14bits weights
    punpcklbw m4, m0
    punpcklbw m5, m0
    punpcklbw m6, m0
    punpcklbw m7, m0

    ; shift pixels up 7 so pmulhw (x*w >> 16) leaves x*w >> 9; the
    ; final >> 5 below completes the FP0.14 normalization (>> 14 total)
    psllw m4, 7
    psllw m5, 7
    psllw m6, 7
    psllw m7, 7
    pmulhw m4, m3                       ; src1 * w2
    pmulhw m5, m2                       ; src2 * w1
    pmulhw m6, m3
    pmulhw m7, m2

    paddw m4, m5
    paddw m6, m7
%else
    ; 5bits weights
%if cpuflag(ssse3)
    ; m3 holds interleaved (w2,w1) byte pairs, so a single pmaddubsw
    ; on interleaved (src1,src2) bytes computes src1*w2 + src2*w1
    punpcklbw m4, m5
    punpcklbw m6, m7

    pmaddubsw m4, m3
    pmaddubsw m6, m3
%else
    punpcklbw m4, m0
    punpcklbw m5, m0
    punpcklbw m6, m0
    punpcklbw m7, m0

    pmullw m4, m3                       ; src1 * w2
    pmullw m5, m2                       ; src2 * w1
    pmullw m6, m3
    pmullw m7, m2
    paddw m4, m5
    paddw m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw m4, m1                     ; m1 = pw_1024: (x + 16) >> 5
    pmulhrsw m6, m1
%else
    paddw m4, m1                        ; m1 = pw_16 rounding bias
    paddw m6, m1
    psrlw m4, 5
    psrlw m6, 5
%endif

    packuswb m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and sse2: low/high halves are two rows
    sub r6, r5
    movh [%2 + r6], m4
    add r6, r5
    movhps [%2 + r6], m4
%else
    mova [%2 + r6], m4
%endif
%endmacro
416
417
; MAIN_LOOP size, rnd
; One iteration of the weight-function loop: a full row for mmx and for
; 16-wide sse2, or two rows at once for 8-wide sse2 (stride is passed
; so RV40_WCORE can pack two rows into one xmm register).
; The trailing 'add r6, r5' advances the negative byte offset toward
; zero AND produces the flags that the caller's 'jnz .loop' tests.
%macro MAIN_LOOP 2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8 ; second half of a 16-wide row
%endif

    ; Prepare for next loop
    add r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare 2 next lines (RV40_WCORE already advanced r6 by one row)
    add r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare single next line
    add r6, r5
%endif
%endif

%endmacro
440
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                        int w1, int w2, int stride)
; %1=rnd/nornd variant, %2=block size (8 or 16), %3=log2(%2)
; (the original comment's "%2=num of xmm regs" was wrong: %3 is used
; below as the shift turning stride into the whole-block byte count)
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, the caller checks whether they are multiples of 2^9 for
; those simplifications to occur: 'nornd' takes pre-simplified 5-bit
; weights, 'rnd' the full 14-bit ones.
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova m1, [pw_1024]                  ; pmulhrsw by 1024 == rounded >> 5
%else
    mova m1, [pw_16]                    ; additive rounding bias before >> 5
%endif
    pxor m0, m0                         ; zero for byte->word unpacking
    ; Set loop counter and increments: advance all pointers past the
    ; block and run r6 from -(stride * height) up toward zero
    mov r6, r5
    shl r6, %3                          ; r6 = stride << log2(height)
    add r0, r6
    add r1, r6
    add r2, r6
    neg r6

    movd m2, r3d                        ; w1
    movd m3, r4d                        ; w2
%ifidn %1,rnd
%define RND 0
    SPLATW m2, m2
%else
%define RND 1
%if cpuflag(ssse3)
    punpcklbw m3, m2                    ; (w2,w1) byte pairs for pmaddubsw
%else
    SPLATW m2, m2
%endif
%endif
    SPLATW m3, m3

.loop:
    MAIN_LOOP %2, RND                   ; sets ZF once r6 reaches 0
    jnz .loop
    REP_RET
%endmacro
484
; Instantiate every rnd/nornd x 8/16 x ISA combination of the weight
; functions (third argument is log2 of the block size)
INIT_MMX mmxext
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4

INIT_XMM sse2
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4