swscale: x86: Add some forgotten 12-bit planar YUV cases
libswscale/x86/scale.asm
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff    ; (1 << 19) - 1, clip bound for 19-bit output
max_19bit_flt: times 4 dd 524287.0   ; the same bound as floats, for the sse2/ssse3 clip
minshort:      times 8 dw 0x8000     ; bias to turn unsigned 16-bit input into signed
unicoeff:      times 4 dd 0x20000000 ; 0x8000 << 14, re-adds the bias after multiplying

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input pixels are either 8 or 16 bits wide
; ($source_width can be 8, 9, 10, 12 or 16; the difference is whether we have
; to downscale before multiplying). Filter coefficients are 14 bits. Output
; is either 15 bits (in int16_t) or 19 bits (in int32_t), as given in
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels, and the position of the first one is given in
; filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
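;
; Roughly the following C, as a sketch for orientation only (the saturating
; pack on the 15-bit path additionally clips negative sums, which the FFMIN
; below does not show):
;
;     for (i = 0; i < dstW; i++) {
;         int j, val = 0;
;         for (j = 0; j < filterSize; j++)
;             val += src[filterPos[i] + j] * filter[i * filterSize + j];
;         dst[i] = FFMIN(val >> (14 + source_width - intermediate_nbits),
;                        (1 << intermediate_nbits) - 1);
;     }
;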

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
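; e.g. "SCALE_FUNC 8, 15, 4, 4, 6, 6" defines an hscale8to15_4 function using
; 6 GPRs and 6 XMM registers (the cpu suffix is appended by cglobal)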
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16
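; constants set up above (where used): m2 = 19-bit clip bound, m3 = zero for
; byte -> word unpacking of 8-bit input, m6/m7 = sign-bias pair for 16-bit input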

%if %1 == 8
%define movlh movd  ; load 4 8-bit pixels
%define movbh movh  ; load 8 8-bit pixels
%define srcmul 1    ; bytes per source pixel
%else ; %1 == 9-16
%define movlh movq  ; load 4 16-bit pixels
%define movbh movu  ; load a full register of pixels
%define srcmul 2    ; bytes per source pixel
%endif ; %1 == 8/9-16

%ifnidn %3, X

; setup loop
%if %3 == 8
    shl           wq, 1 ; double w so that the wq*8 scaling in the leas below
                        ; becomes w*16, i.e. the 8-tap filter's 16 bytes/pixel
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq
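; wq now runs from -w (scaled as above) up to 0, so the arrays are indexed
; from their ends with a negative offset; the "add wq ... jl .loop" at the
; bottom then reuses the add's sign flag and saves a compare per pixel group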

.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]     ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]     ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                      ; byte -> word
    punpcklbw     m1, m3                      ; byte -> word
%endif ; %1 == 8

; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
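             ; (since the 14-bit coefficients sum to 1 << 14 for unity gain,
             ;  sum(coef * (pix - 0x8000)) = sum(coef * pix) - (0x8000 << 14),
             ;  so adding unicoeff = 0x20000000 afterwards undoes the bias)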
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b ; gather the even dwords of m0/m1
    shufps        m4, m1, 11011101b ; gather the odd dwords of m0/m1
    paddd         m0, m4            ; pairwise horizontal add
%else ; ssse3/sse4
    phaddd        m0, m1            ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                    ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                    ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                    ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]  ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]  ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]  ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3 ; byte -> word
    punpcklbw     m1, m3 ; byte -> word
    punpcklbw     m4, m3 ; byte -> word
    punpcklbw     m5, m3 ; byte -> word
%endif ; %1 == 8

; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6 ; m6 is free here (minshort is only loaded for 16-bit input)
%else
%define mex m3 ; m3 is free here (the zero reg is only needed for 8-bit input)
%endif
; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                         ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                         ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                         ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov         srcq, srcmemmp

.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                  ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2] ; filter[filterSize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq ; while ((src += 4 (mmx) / 8 (sse) px) < &src[filterSize])
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

; clip, store
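; the products carry 14 (coefficient) + %1 (source) bits of precision, so a
; right shift by 14 + %1 - %2 leaves a %2-bit result (e.g. 8-bit source and
; 15-bit output: shift by 7; 16-bit source and 19-bit output: shift by 11)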
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elif cpuflag(sse4)
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4 ; both 8-tap and 4-tap really only do 4 pixels
                                       ; (or for mmx: 2 pixels) per iteration; see the
                                       ; "shl wq, 1" above for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
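
; on SSE the generic case comes in two flavors: X4 peels a trailing group of
; 4 taps (the dlt handling above), for filter sizes that are 4 mod 8, while
; X8 assumes a multiple of 8; MMX only needs the plain X variant, since its
; inner loop consumes 4 taps per iteration anyway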

; SCALE_FUNCS2 8_xmm_args, 9to12_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4) ; sse4 only differs on the 19-bit path (pminsd), so the
                     ; 15-bit functions are not built again for it
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 6, 7, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
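
; each INIT_* + SCALE_FUNCS2 pass above emits the whole function set with the
; matching cpu suffix, e.g. hscale12to15_4_sse2 or hscale12to19_X8_ssse3; the
; 12-bit entries are the cases referred to by this commit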