;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff    ; 2^19 - 1, clip limit for 19-bit output
max_19bit_flt: times 4 dd 524287.0   ; the same limit as float, for the cvtdq2ps/minps clip path
minshort:      times 8 dw 0x8000     ; bias that turns unsigned 16-bit input signed for pmaddwd
unicoeff:      times 4 dd 0x20000000 ; 0x8000 * 0x4000: undoes the minshort bias after the
                                     ; horizontal add (0x4000 = unity filter gain in Q14)

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                                  (SwsContext *c, int{16,32}_t *dst,
;                                   int dstW, const uint{8,16}_t *src,
;                                   const int16_t *filter,
;                                   const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
; downscale before multiplying). The filter coefficients are 14-bit. Output
; is either 15-bit (in int16_t) or 19-bit (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first one is given in filterPos[nOutputPixel].
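;
; In C terms, each output pixel is computed roughly as in the sketch below
; (illustrative, not the exact reference implementation; "shift" is
; 14 + source_width - intermediate_nbits, matching the psrad at the end):
;
;   for (i = 0; i < dstW; i++) {
;       int val = 0, j;
;       for (j = 0; j < filterSize; j++)
;           val += (int) src[filterPos[i] + j] * filter[i * filterSize + j];
;       dst[i] = clip(val >> shift); // saturated to intermediate_nbits bits
;   }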
;-----------------------------------------------------------------------------

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm
%macro SCALE_FUNC 7
cglobal hscale%1to%2_%4_%5, %6, 7, %7
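; x86inc argument -> register mapping, following the prototype above:
;   r0 = c, r1 = dst, r2 = dstW, r3 = src, r4 = filter, r5 = filterPos,
;   r6 = filterSize (an argument only for the variable-size "X" variants)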
%if ARCH_X86_64
    movsxd        r2, r2d
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elifidn %5, sse4
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           r2, 1                  ; double r2 so that lea's maximum *8 scale
                                         ; gives the effective *16 step the 8-tap
                                         ; filter needs per output pixel
%define r2shr 1
%else ; %3 == 4
%define r2shr 0
%endif ; %3 == 8
    lea           r4, [r4+r2*8]
%if %2 == 15
    lea           r1, [r1+r2*(2>>r2shr)]
%else ; %2 == 19
    lea           r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19
    lea           r5, [r5+r2*(4>>r2shr)]
    neg           r2
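    ; from here on r2 counts upward from -dstW (scaled) to zero; the pointers
    ; above were advanced to their end positions, so [reg+r2*n] still indexes
    ; forward while "add r2, ..." / "jl .loop" drives the loop without a
    ; separate counter register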

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32         r0, dword [r5+r2*4+0]  ; filterPos[0]
    mov32         r6, dword [r5+r2*4+4]  ; filterPos[1]
    movlh         m0, [r3+r0*srcmul]     ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32         r0, dword [r5+r2*4+8]  ; filterPos[2]
    mov32         r6, dword [r5+r2*4+12] ; filterPos[3]
    movlh         m1, [r3+r0*srcmul]     ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [r3+r6*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [r3+r6*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8 && %5 <= ssse3
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                 ; byte -> word
    punpcklbw     m1, m3                 ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elifidn %5, sse2
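    ; pre-ssse3 there is no phaddd; emulate it by gathering the even dwords
    ; of m0/m1 (shufps 0x88) and the odd dwords (0xdd), then adding the two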
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                         ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                         ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                         ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32         r0, dword [r5+r2*2+0]  ; filterPos[0]
    mov32         r6, dword [r5+r2*2+4]  ; filterPos[1]
    movbh         m0, [r3+ r0   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32         r0, dword [r5+r2*2+8]  ; filterPos[2]
    mov32         r6, dword [r5+r2*2+12] ; filterPos[3]
    movbh         m4, [r3+ r0   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [r3+ r6   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                 ; byte -> word
    punpcklbw     m1, m3                 ; byte -> word
    punpcklbw     m4, m3                 ; byte -> word
    punpcklbw     m5, m3                 ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elifidn %5, sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                         ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                         ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                         ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define r6sub 4
%else ; %4 == X || %4 == X8
%define r6sub 0
%endif ; %4 ==/!= X4
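; the SSE inner loop below consumes 8 filter taps per iteration (the mmx
; version steps 4 at a time and uses plain X), so X8 serves filter sizes that
; are a multiple of 8 while X4 (r6sub == 4) serves filterSize % 8 == 4,
; handling the last 4 taps of one output pixel together with the first 4 taps
; of the next in a single tail step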
%if ARCH_X86_64
    push         r12
    movsxd        r6, r6d                ; filterSize
    lea          r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
%define src_reg r11
%define r1x r10
%define filter2 r12
%else ; x86-32
    lea           r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
    mov          r6m, r0
%define src_reg r3
%define r1x r1
%define filter2 r6m
%endif ; x86-32/64
    lea           r5, [r5+r2*4]
%if %2 == 15
    lea           r1, [r1+r2*2]
%else ; %2 == 19
    lea           r1, [r1+r2*4]
%endif ; %2 == 15/19
    movifnidn   r1mp, r1
    neg           r2
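    ; r2 again runs from -dstW up to zero, two output pixels per iteration;
    ; r1 was spilled to its stack slot above because r1x aliases r1 on x86-32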

.loop:
    mov32         r0, dword [r5+r2*4+0]  ; filterPos[0]
    mov32        r1x, dword [r5+r2*4+4]  ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov      src_reg, r3mp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [src_reg+r0          *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [src_reg+(r1x+r6sub) *srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]               ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [r4+(r6+r6sub)*2]  ; filter[filterSize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add           r4, mmsize
    add      src_reg, srcmul*mmsize/2
    cmp      src_reg, filter2            ; while (src += mmsize/2 px) < &src[filterSize&~4]
    jl .innerloop

%ifidn %4, X4
    mov32        r1x, dword [r5+r2*4+4]  ; filterPos[1]
    movlh         m0, [src_reg+r0          *srcmul] ; split last 4 srcpx of dstpx[0]
    sub          r1x, r6                             ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [src_reg+(r1x+r6sub)*srcmul]
%else ; %1 == 8
    movd          m1, [src_reg+(r1x+r6sub)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8 && %5 <= ssse3
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]
%endif ; %4 == X4

    lea           r4, [r4+(r6+r6sub)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%ifidn %5, sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%ifidn %5, sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
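    ; each product carries %1 source bits plus 14 filter bits; shifting the
    ; accumulated sum right by 14 + %1 - %2 rescales it to the %2-bit
    ; intermediate range before saturation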
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn     r1, r1mp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [r1+r2*(2>>r2shr)], m0
%else ; %3 == X
    movd   [r1+r2*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elifidn %5, sse4
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [r1+r2*(4>>r2shr)], m0
%else ; %3 == X
    movq [r1+r2*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           r2, (mmsize<<r2shr)/4  ; both the 8-tap and the 4-tap version only do
                                         ; 4 pixels (mmx: 2 pixels) per iteration; see
                                         ; the "shl r2, 1" above for why the 8-tap case
                                         ; steps r2 by twice as much
%else ; %3 == X
    add           r2, 2
%endif ; %3 ==/!= X
    jl .loop
%ifnidn %3, X
    REP_RET
%else ; %3 == X
%if ARCH_X86_64
    pop          r12
    RET
%else ; x86-32
    REP_RET
%endif ; x86-32/64
%endif ; %3 ==/!= X
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm
%macro SCALE_FUNCS 4
SCALE_FUNC %1, %2, 4, 4, %3, 6, %4
SCALE_FUNC %1, %2, 8, 8, %3, 6, %4
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, %3, 7, %4
%else
SCALE_FUNC %1, %2, X, X4, %3, 7, %4
SCALE_FUNC %1, %2, X, X8, %3, 7, %4
%endif
%endmacro

; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 4
%ifnidn %1, sse4
SCALE_FUNCS  8, 15, %1, %2
SCALE_FUNCS  9, 15, %1, %3
SCALE_FUNCS 10, 15, %1, %3
SCALE_FUNCS 16, 15, %1, %4
%endif ; !sse4
SCALE_FUNCS  8, 19, %1, %2
SCALE_FUNCS  9, 19, %1, %3
SCALE_FUNCS 10, 19, %1, %3
SCALE_FUNCS 16, 19, %1, %4
%endmacro
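
; the 15-bit variants are skipped for sse4: the only SSE4.1 instruction used
; in this file is pminsd, which appears only in the 19-bit clipping path, so
; 15-bit sse4 functions would be identical to their ssse3 counterparts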

%if ARCH_X86_32
INIT_MMX
SCALE_FUNCS2 mmx,   0, 0, 0
%endif
INIT_XMM
SCALE_FUNCS2 sse2,  6, 7, 8
SCALE_FUNCS2 ssse3, 6, 6, 8
SCALE_FUNCS2 sse4,  6, 6, 8