;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
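; 1 << 10; pmulhrsw against this constant is a rounded shift right by 5:
; (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5 (used by the SSSE3 weight functions)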

sixtap_filter_hb_m: times 8 db   1, -5
                    times 8 db  52, 20
                    ; multiplied by 2 to have the same shift
                    times 8 db   2, -10
                    times 8 db  40,  40
                    ; back to normal
                    times 8 db   1, -5
                    times 8 db  20, 52

sixtap_filter_v_m:  times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  52
                    times 8 dw  20
                    ; multiplied by 2 to have the same shift
                    times 8 dw   2
                    times 8 dw -10
                    times 8 dw  40
                    times 8 dw  40
                    ; back to normal
                    times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  20
                    times 8 dw  52
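; The two tables above hold the three RV40 six-tap kernels
; {1, -5, 52, 20, -5, 1}, {1, -5, 20, 20, -5, 1} and {1, -5, 20, 52, -5, 1};
; the middle kernel is stored doubled so that every variant shares the same
; (sum + 32) >> 6 rounding. sixtap_filter_hb_m stores the taps as signed byte
; pairs for pmaddubsw (SSSE3), sixtap_filter_v_m stores them as words for
; pmullw (MMX/SSE2).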

%ifdef PIC
%define sixtap_filter_hw  picregq
%define sixtap_filter_hb  picregq
%define sixtap_filter_v   picregq
%define npicregs 1
%else
%define sixtap_filter_hw  sixtap_filter_hw_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define sixtap_filter_v   sixtap_filter_v_m
%define npicregs 0
%endif

filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern pw_32
cextern pw_16
cextern pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                      uint8_t *src, int srcstride,
;                                      int len, int m);
;-----------------------------------------------------------------------------
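; LOAD turns the mx/my argument (a byte offset into the coefficient table,
; as the [myq+16] style accesses below show) into an absolute pointer: with
; PIC the table base is in picregq, otherwise the symbol address is added
; directly. On WIN64 the 32-bit argument is sign-extended first, since its
; upper half is not guaranteed to be zero.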
%macro LOAD 2
%if WIN64
    movsxd %1q, %1d
%endif
%ifdef PIC
    add %1q, picregq
%else
    add %1q, %2
%endif
%endmacro

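; STORE packs the row of words in %1 down to 8 bytes and writes them to dst;
; the avg variants first average with the existing dst pixels (pavgusb on
; 3dnow, pavgb otherwise), using %2 as scratch.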
%macro STORE 3
%ifidn %3, avg
    movh %2, [dstq]
%endif
    packuswb %1, %1
%ifidn %3, avg
%if cpuflag(3dnow)
    pavgusb %1, %2
%else
    pavgb %1, %2
%endif
%endif
    movh [dstq], %1
%endmacro

%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    pxor m7, m7
    LOAD my, sixtap_filter_v

    ; read 5 lines
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

%ifdef m8
    mova m8, [myq+ 0]
    mova m9, [myq+16]
    mova m10, [myq+32]
    mova m11, [myq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [myq+ 0]
%define COEFF14 [myq+16]
%define COEFF2 [myq+32]
%define COEFF3 [myq+48]
%endif
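    ; m0..m4 hold rows r-2 .. r+2, already unpacked to words; each iteration
    ; loads row r+3 into m5 and computes
    ;   dst[r] = ((m0 + m5) * c05 + (m1 + m4) * c14 + m2 * c2 + m3 * c3 + 32) >> 6
    ; before sliding the five-row window down by one line.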
.nextrow:
    mova m6, m1
    movh m5, [srcq+2*srcstrideq] ; read new row
    paddw m6, m4
    punpcklbw m5, m7
    pmullw m6, COEFF14
    paddw m0, m5
    pmullw m0, COEFF05
    paddw m6, m0
    mova m0, m1
    paddw m6, [pw_32]
    mova m1, m2
    pmullw m2, COEFF2
    paddw m6, m2
    mova m2, m3
    pmullw m3, COEFF3
    paddw m6, m3

    ; round/clip/store
    mova m3, m4
    psraw m6, 6
    mova m4, m5
    STORE m6, m5, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
%endmacro

%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    pxor m7, m7
    LOAD mx, sixtap_filter_v
    mova m6, [pw_32]
%ifdef m8
    mova m8, [mxq+ 0]
    mova m9, [mxq+16]
    mova m10, [mxq+32]
    mova m11, [mxq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [mxq+ 0]
%define COEFF14 [mxq+16]
%define COEFF2 [mxq+32]
%define COEFF3 [mxq+48]
%endif
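    ; horizontal 6-tap: for each of the 8 output pixels x,
    ;   dst[x] = ((src[x-2] + src[x+3]) * c05 + (src[x-1] + src[x+2]) * c14
    ;             + src[x] * c2 + src[x+1] * c3 + 32) >> 6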
.nextrow:
    movq m0, [srcq-2]
    movq m5, [srcq+3]
    movq m1, [srcq-1]
    movq m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq m2, [srcq-0]
    movq m3, [srcq+1]
    paddw m0, m5
    paddw m1, m4
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw m0, COEFF05
    pmullw m1, COEFF14
    pmullw m2, COEFF2
    pmullw m3, COEFF3
    paddw m0, m6
    paddw m1, m2
    paddw m0, m3
    paddw m0, m1
    psraw m0, 6
    STORE m0, m1, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
FILTER_V put
FILTER_H put

INIT_MMX mmxext
FILTER_V avg
FILTER_H avg

INIT_MMX 3dnow
FILTER_V avg
FILTER_H avg
%endif

INIT_XMM sse2
FILTER_H put
FILTER_H avg
FILTER_V put
FILTER_V avg

%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines
    sub srcq, srcstrideq
    LOAD my, sixtap_filter_hb
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    mova m5, [myq]
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]
    lea srcq, [srcq+2*srcstrideq]

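    ; m0..m4 hold rows r-2 .. r+2 as packed bytes; adjacent rows are
    ; interleaved with punpcklbw so pmaddubsw can apply the signed byte
    ; coefficient pairs directly. pmulhrsw with pw_512 (1 << 9) is the
    ; rounded shift: (x * 512 + 0x4000) >> 15 == (x + 32) >> 6.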
.nextrow:
    mova m6, m2
    punpcklbw m0, m1
    punpcklbw m6, m3
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh m7, [srcq] ; read new row
    paddw m6, m0
    mova m0, m1
    mova m1, m2
    mova m2, m3
    mova m3, m4
    mova m4, m7
    punpcklbw m7, m3
    pmaddubsw m7, m5
    paddw m6, m7
    pmulhrsw m6, [pw_512]
    STORE m6, m7, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    mova m3, [filter_h6_shuf2]
    mova m4, [filter_h6_shuf3]
    LOAD mx, sixtap_filter_hb
    mova m5, [mxq] ; set up 6tap filter in bytes
    mova m6, [mxq+16]
    mova m7, [filter_h6_shuf1]

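    ; filter_h6_shuf1/2/3 gather the byte pairs (x-2, x-1), (x, x+1) and
    ; (x+3, x+2) for every output pixel x, so three pmaddubsw plus two paddw
    ; evaluate the whole 6-tap kernel; pw_512 again provides the rounded >> 6.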
.nextrow:
    movu m0, [srcq-2]
    mova m1, m0
    mova m2, m0
    pshufb m0, m7
    pshufb m1, m3
    pshufb m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw m0, m1
    paddw m0, m2
    pmulhrsw m0, [pw_512]
    STORE m0, m1, %1

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3 put
FILTER_SSSE3 avg

; %1=1 if 5-bit weights, %2=dst, %3=src1, %4=src2, %5=stride (only for the 8-wide xmm case)
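; Weighted average of one 8-pixel run from each source. With 14-bit weights
; (%1 == 0) the pixels are widened to words, shifted left by 7 and multiplied
; with pmulhw, i.e. pix * w >> 9, so the final (x + 16) >> 5 (or pmulhrsw with
; pw_1024) completes the >> 14 of the FP0.14 weights with rounding. With
; 5-bit weights (%1 == 1) a plain pmullw/pmaddubsw is enough.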
%macro RV40_WCORE 4-5
    movh m4, [%3 + r6 + 0]
    movh m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and sse2, stride was provided
%define OFFSET r6
    add r6, r5
%endif
    movh m6, [%3 + OFFSET]
    movh m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
    punpcklbw m4, m0
    punpcklbw m5, m0
    punpcklbw m6, m0
    punpcklbw m7, m0

    psllw m4, 7
    psllw m5, 7
    psllw m6, 7
    psllw m7, 7
    pmulhw m4, m3
    pmulhw m5, m2
    pmulhw m6, m3
    pmulhw m7, m2

    paddw m4, m5
    paddw m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    punpcklbw m4, m5
    punpcklbw m6, m7

    pmaddubsw m4, m3
    pmaddubsw m6, m3
%else
    punpcklbw m4, m0
    punpcklbw m5, m0
    punpcklbw m6, m0
    punpcklbw m7, m0

    pmullw m4, m3
    pmullw m5, m2
    pmullw m6, m3
    pmullw m7, m2
    paddw m4, m5
    paddw m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw m4, m1
    pmulhrsw m6, m1
%else
    paddw m4, m1
    paddw m6, m1
    psrlw m4, 5
    psrlw m6, 5
%endif

    packuswb m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and sse2
    sub r6, r5
    movh [%2 + r6], m4
    add r6, r5
    movhps [%2 + r6], m4
%else
    mova [%2 + r6], m4
%endif
%endmacro


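; One pass over the block per iteration: with mmx (8-byte registers) a
; 16-wide block needs two RV40_WCORE calls per row, while the 8-wide xmm
; case passes the stride so RV40_WCORE can finish two rows at once.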
%macro MAIN_LOOP 2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif

    ; Prepare for next loop
    add r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare 2 next lines
    add r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare single next line
    add r6, r5
%endif
%endif

%endmacro

; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=rnd/nornd  %2=block size  %3=log2 of the block size
; The weights are fractions in FP0.14 notation, depending on the pts.
; For timebases without rounding error (e.g. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, it is checked whether they are multiples of 2^9 so that
; those simplifications can occur.
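; %1 selects the variant: rnd keeps the full 14-bit weights, while nornd is
; used when both weights are multiples of 1 << 9 and are presumably passed
; already scaled down to 5 bits; for SSSE3 the two 5-bit weights are
; interleaved into byte pairs here so RV40_WCORE can feed them to pmaddubsw.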
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova m1, [pw_1024]
%else
    mova m1, [pw_16]
%endif
    pxor m0, m0
    ; Set loop counter and increments
    mov r6, r5
    shl r6, %3
    add r0, r6
    add r1, r6
    add r2, r6
    neg r6

    movd m2, r3d
    movd m3, r4d
%ifidn %1,rnd
%define RND 0
    SPLATW m2, m2
%else
%define RND 1
%if cpuflag(ssse3)
    punpcklbw m3, m2
%else
    SPLATW m2, m2
%endif
%endif
    SPLATW m3, m3

.loop:
    MAIN_LOOP %2, RND
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4

INIT_XMM sse2
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4