swscale: x86: Add some forgotten 12-bit planar YUV cases
[libav.git] / libswscale / x86 / scale.asm
CommitLineData
e0c3e073 1;******************************************************************************
6ea64339 2;* x86-optimized horizontal line scaling functions
e0c3e073
RB
3;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
04581c8c 22%include "libavutil/x86/x86util.asm"
e0c3e073
RB
23
24SECTION_RODATA
25
; 0x7ffff = 2^19 - 1: integer clip limit for the 19-bit output path
; (used by the mmx PMINSD_MMX and sse4 pminsd clips)
26max_19bit_int: times 4 dd 0x7ffff
; same 2^19 - 1 limit as a float, for the sse2/ssse3 clip done via
; cvtdq2ps / minps / cvtps2dq (those ISAs lack a packed signed dword min)
27max_19bit_flt: times 4 dd 524287.0
; 0x8000 bias subtracted from 16-bit input samples so pmaddwd's signed
; multiply-add can be used on unsigned data (see psubw m., m6 below)
28minshort: times 8 dw 0x8000
; 0x20000000 = 0x8000 << 14: adds back bias * sum(coeffs) after the
; horizontal add, since the filter coefficients sum to 1 in Q14
29unicoeff: times 4 dd 0x20000000
30
31SECTION .text
32
33;-----------------------------------------------------------------------------
34; horizontal line scaling
35;
36; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
37; (SwsContext *c, int{16,32}_t *dst,
38; int dstW, const uint{8,16}_t *src,
39; const int16_t *filter,
2254b559 40; const int32_t *filterPos, int filterSize);
e0c3e073 41;
41ed7ab4 42; Scale one horizontal line. Input samples are either 8 or 16 bits wide
e0c3e073 43; ($source_width can be 8, 9, 10, 12 or 16, the difference is whether we have to
41ed7ab4
VG
44; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
45; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
e0c3e073
RB
46; output pixel is generated from $filterSize input pixels, the position of
47; the first pixel is given in filterPos[nOutputPixel].
48;-----------------------------------------------------------------------------
49
aba7a827
RB
50; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
; %1 = source bit depth (8/9/10/12/16), %2 = output precision (15 -> int16_t,
;      19 -> int32_t), %3 = filterSize (4, 8, or X = runtime-variable),
; %4 = function-name suffix, %5 = named-arg count, %6 = xmm register count.
51%macro SCALE_FUNC 6
45fdcc8e
RB
52%ifnidn %3, X
53cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
54%else
729f90e2 55cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
45fdcc8e 56%endif
3b15a6d7 57%if ARCH_X86_64
45fdcc8e 58 movsxd wq, wd
2254b559
RB
59%define mov32 movsxd
60%else ; x86-32
61%define mov32 mov
e0c3e073
RB
62%endif ; x86-64
63%if %2 == 19
64%if mmsize == 8 ; mmx
65 mova m2, [max_19bit_int]
aba7a827 66%elif cpuflag(sse4)
e0c3e073
RB
67 mova m2, [max_19bit_int]
68%else ; ssse3/sse2
69 mova m2, [max_19bit_flt]
70%endif ; mmx/sse2/ssse3/sse4
71%endif ; %2 == 19
72%if %1 == 16
73 mova m6, [minshort]
74 mova m7, [unicoeff]
75%elif %1 == 8
76 pxor m3, m3
77%endif ; %1 == 8/16
78
79%if %1 == 8
80%define movlh movd
81%define movbh movh
82%define srcmul 1
83%else ; %1 == 9-16
84%define movlh movq
85%define movbh movu
86%define srcmul 2
87%endif ; %1 == 8/9-16
88
89%ifnidn %3, X
90
91 ; setup loop
92%if %3 == 8
45fdcc8e
RB
93 shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
94%define wshr 1
e0c3e073 95%else ; %3 == 4
45fdcc8e 96%define wshr 0
e0c3e073 97%endif ; %3 == 8
45fdcc8e 98 lea filterq, [filterq+wq*8]
e0c3e073 99%if %2 == 15
45fdcc8e 100 lea dstq, [dstq+wq*(2>>wshr)]
e0c3e073 101%else ; %2 == 19
45fdcc8e 102 lea dstq, [dstq+wq*(4>>wshr)]
e0c3e073 103%endif ; %2 == 15/19
45fdcc8e
RB
104 lea fltposq, [fltposq+wq*(4>>wshr)]
105 neg wq
e0c3e073
RB
106
; dst/filter/fltpos were advanced past their ends above and wq negated, so
; wq counts upward toward 0 and the loop exits when it becomes non-negative.
107.loop:
108%if %3 == 4 ; filterSize == 4 scaling
109 ; load 2x4 or 4x4 source pixels into m0/m1
45fdcc8e
RB
110 mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
111 mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
112 movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
e0c3e073 113%if mmsize == 8
45fdcc8e 114 movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
e0c3e073
RB
115%else ; mmsize == 16
116%if %1 > 8
45fdcc8e 117 movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
e0c3e073 118%else ; %1 == 8
45fdcc8e 119 movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
e0c3e073 120%endif
45fdcc8e
RB
121 mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
122 mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
123 movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
e0c3e073 124%if %1 > 8
45fdcc8e 125 movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
e0c3e073 126%else ; %1 == 8
45fdcc8e 127 movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
e0c3e073
RB
128 punpckldq m0, m4
129 punpckldq m1, m5
aba7a827 130%endif ; %1 == 8
e0c3e073
RB
131%endif ; mmsize == 8/16
132%if %1 == 8
45fdcc8e
RB
133 punpcklbw m0, m3 ; byte -> word
134 punpcklbw m1, m3 ; byte -> word
e0c3e073
RB
135%endif ; %1 == 8
136
137 ; multiply with filter coefficients
138%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
139 ; add back 0x8000 * sum(coeffs) after the horizontal add
140 psubw m0, m6
141 psubw m1, m6
142%endif ; %1 == 16
45fdcc8e
RB
143 pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
144 pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
e0c3e073
RB
145
146 ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
147%if mmsize == 8 ; mmx
148 movq m4, m0
149 punpckldq m0, m1
150 punpckhdq m4, m1
151 paddd m0, m4
aba7a827 152%elif notcpuflag(ssse3) ; sse2
e0c3e073
RB
153 mova m4, m0
154 shufps m0, m1, 10001000b
155 shufps m4, m1, 11011101b
156 paddd m0, m4
157%else ; ssse3/sse4
45fdcc8e
RB
158 phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
159 ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
160 ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
161 ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
e0c3e073
RB
162%endif ; mmx/sse2/ssse3/sse4
163%else ; %3 == 8, i.e. filterSize == 8 scaling
164 ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
45fdcc8e
RB
165 mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
166 mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
167 movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
e0c3e073 168%if mmsize == 8
45fdcc8e
RB
169 movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
170 movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
171 movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
e0c3e073 172%else ; mmsize == 16
45fdcc8e
RB
173 movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
174 mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
175 mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
176 movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
177 movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
e0c3e073
RB
178%endif ; mmsize == 8/16
179%if %1 == 8
45fdcc8e
RB
180 punpcklbw m0, m3 ; byte -> word
181 punpcklbw m1, m3 ; byte -> word
182 punpcklbw m4, m3 ; byte -> word
183 punpcklbw m5, m3 ; byte -> word
e0c3e073
RB
184%endif ; %1 == 8
185
186 ; multiply
187%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
188 ; add back 0x8000 * sum(coeffs) after the horizontal add
189 psubw m0, m6
190 psubw m1, m6
191 psubw m4, m6
192 psubw m5, m6
193%endif ; %1 == 16
45fdcc8e
RB
194 pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
195 pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
196 pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
197 pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
e0c3e073
RB
198
199 ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
200%if mmsize == 8
201 paddd m0, m1
202 paddd m4, m5
203 movq m1, m0
204 punpckldq m0, m4
205 punpckhdq m1, m4
206 paddd m0, m1
aba7a827 207%elif notcpuflag(ssse3) ; sse2
e0c3e073
RB
208%if %1 == 8
209%define mex m6
210%else
211%define mex m3
212%endif
213 ; emulate horizontal add as transpose + vertical add
214 mova mex, m0
215 punpckldq m0, m1
216 punpckhdq mex, m1
217 paddd m0, mex
218 mova m1, m4
219 punpckldq m4, m5
220 punpckhdq m1, m5
221 paddd m4, m1
222 mova m1, m0
223 punpcklqdq m0, m4
224 punpckhqdq m1, m4
225 paddd m0, m1
226%else ; ssse3/sse4
227 ; FIXME if we rearrange the filter in pairs of 4, we can
228 ; load pixels likewise and use 2 x paddd + phaddd instead
229 ; of 3 x phaddd here, faster on older cpus
230 phaddd m0, m1
231 phaddd m4, m5
45fdcc8e
RB
232 phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
233 ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
234 ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
235 ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
e0c3e073
RB
236%endif ; mmx/sse2/ssse3/sse4
237%endif ; %3 == 4/8
238
239%else ; %3 == X, i.e. any filterSize scaling
240
241%ifidn %4, X4
45fdcc8e 242%define dlt 4
e0c3e073 243%else ; %4 == X || %4 == X8
45fdcc8e 244%define dlt 0
e0c3e073 245%endif ; %4 ==/!= X4
3b15a6d7 246%if ARCH_X86_64
729f90e2
HG
247%define srcq r8
248%define pos1q r7
249%define srcendq r9
45fdcc8e
RB
250 movsxd fltsizeq, fltsized ; filterSize
251 lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
e0c3e073 252%else ; x86-32
45fdcc8e
RB
253%define srcq srcmemq
254%define pos1q dstq
255%define srcendq r6m
256 lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
257 mov srcendq, pos0q
e0c3e073 258%endif ; x86-32/64
45fdcc8e 259 lea fltposq, [fltposq+wq*4]
e0c3e073 260%if %2 == 15
45fdcc8e 261 lea dstq, [dstq+wq*2]
e0c3e073 262%else ; %2 == 19
45fdcc8e 263 lea dstq, [dstq+wq*4]
e0c3e073 264%endif ; %2 == 15/19
45fdcc8e
RB
265 movifnidn dstmp, dstq
266 neg wq
e0c3e073
RB
267
268.loop:
45fdcc8e
RB
269 mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
270 mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
e0c3e073
RB
271 ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
272 pxor m4, m4
273 pxor m5, m5
45fdcc8e 274 mov srcq, srcmemmp
e0c3e073
RB
275
276.innerloop:
277 ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
45fdcc8e
RB
278 movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
279 movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
e0c3e073
RB
280%if %1 == 8
281 punpcklbw m0, m3
282 punpcklbw m1, m3
283%endif ; %1 == 8
284
285 ; multiply
286%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
287 ; add back 0x8000 * sum(coeffs) after the horizontal add
288 psubw m0, m6
289 psubw m1, m6
290%endif ; %1 == 16
45fdcc8e
RB
291 pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
292 pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
e0c3e073
RB
293 paddd m4, m0
294 paddd m5, m1
45fdcc8e
RB
295 add filterq, mmsize
296 add srcq, srcmul*mmsize/2
297 cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
e0c3e073
RB
298 jl .innerloop
299
300%ifidn %4, X4
45fdcc8e
RB
301 mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
302 movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
303 sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
e0c3e073 304%if %1 > 8
45fdcc8e 305 movhps m0, [srcq+(pos1q+dlt)*srcmul]
e0c3e073 306%else ; %1 == 8
45fdcc8e 307 movd m1, [srcq+(pos1q+dlt)*srcmul]
e0c3e073 308 punpckldq m0, m1
aba7a827 309%endif ; %1 == 8
e0c3e073
RB
310%if %1 == 8
311 punpcklbw m0, m3
312%endif ; %1 == 8
313%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
314 ; add back 0x8000 * sum(coeffs) after the horizontal add
315 psubw m0, m6
316%endif ; %1 == 16
45fdcc8e 317 pmaddwd m0, [filterq]
e0c3e073
RB
318%endif ; %4 == X4
319
45fdcc8e 320 lea filterq, [filterq+(fltsizeq+dlt)*2]
e0c3e073
RB
321
322%if mmsize == 8 ; mmx
323 movq m0, m4
324 punpckldq m4, m5
325 punpckhdq m0, m5
326 paddd m0, m4
327%else ; mmsize == 16
aba7a827 328%if notcpuflag(ssse3) ; sse2
e0c3e073
RB
329 mova m1, m4
330 punpcklqdq m4, m5
331 punpckhqdq m1, m5
332 paddd m4, m1
333%else ; ssse3/sse4
334 phaddd m4, m5
335%endif ; sse2/ssse3/sse4
336%ifidn %4, X4
337 paddd m4, m0
338%endif ; %3 == X4
aba7a827 339%if notcpuflag(ssse3) ; sse2
e0c3e073
RB
340 pshufd m4, m4, 11011000b
341 movhlps m0, m4
342 paddd m0, m4
343%else ; ssse3/sse4
344 phaddd m4, m4
345 SWAP 0, 4
346%endif ; sse2/ssse3/sse4
347%endif ; mmsize == 8/16
348%endif ; %3 ==/!= X
349
350%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
351 paddd m0, m7
352%endif ; %1 == 16
353
354 ; clip, store
; shift from (14-bit filter coeffs * %1-bit samples) down to %2 output bits
355 psrad m0, 14 + %1 - %2
356%ifidn %3, X
45fdcc8e 357 movifnidn dstq, dstmp
e0c3e073
RB
358%endif ; %3 == X
359%if %2 == 15
360 packssdw m0, m0
361%ifnidn %3, X
45fdcc8e 362 movh [dstq+wq*(2>>wshr)], m0
e0c3e073 363%else ; %3 == X
45fdcc8e 364 movd [dstq+wq*2], m0
e0c3e073
RB
365%endif ; %3 ==/!= X
366%else ; %2 == 19
367%if mmsize == 8
368 PMINSD_MMX m0, m2, m4
aba7a827 369%elif cpuflag(sse4)
e0c3e073
RB
370 pminsd m0, m2
371%else ; sse2/ssse3
372 cvtdq2ps m0, m0
373 minps m0, m2
374 cvtps2dq m0, m0
375%endif ; mmx/sse2/ssse3/sse4
376%ifnidn %3, X
45fdcc8e 377 mova [dstq+wq*(4>>wshr)], m0
e0c3e073 378%else ; %3 == X
45fdcc8e 379 movq [dstq+wq*4], m0
e0c3e073
RB
380%endif ; %3 ==/!= X
381%endif ; %2 == 15/19
382%ifnidn %3, X
45fdcc8e
RB
383 add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
384 ; per iteration. see "shl wq,1" above as for why we do this
e0c3e073 385%else ; %3 == X
45fdcc8e 386 add wq, 2
e0c3e073
RB
387%endif ; %3 ==/!= X
388 jl .loop
e0c3e073 389 REP_RET
e0c3e073
RB
390%endmacro
391
aba7a827
RB
392; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
; Emit all filterSize variants for one (source_width, output_bits) pair:
; fixed 4-tap, fixed 8-tap, and the runtime-variable-size ("X") versions.
; On SSE builds the X case is split into X4 and X8 flavours (dlt = 4 vs 0
; in SCALE_FUNC); NOTE(review): presumably X4 covers filterSize % 8 == 4 --
; confirm against the C-side function-pointer selection.
393%macro SCALE_FUNCS 3
394SCALE_FUNC %1, %2, 4, 4, 6, %3
395SCALE_FUNC %1, %2, 8, 8, 6, %3
e0c3e073 396%if mmsize == 8
aba7a827 397SCALE_FUNC %1, %2, X, X, 7, %3
e0c3e073 398%else
aba7a827
RB
399SCALE_FUNC %1, %2, X, X4, 7, %3
400SCALE_FUNC %1, %2, X, X8, 7, %3
e0c3e073
RB
401%endif
402%endmacro
403
aba7a827
RB
404; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
; Emit functions for every supported source depth; 12-bit input reuses the
; 9/10-bit xmm-count argument (%2) since it is likewise stored in uint16_t.
; The 15-bit-output group is skipped for SSE4 builds: SSE4 only changes the
; 19-bit clip (pminsd in SCALE_FUNC), so the SSSE3 15-bit functions suffice.
405%macro SCALE_FUNCS2 3
406%if notcpuflag(sse4)
407SCALE_FUNCS 8, 15, %1
408SCALE_FUNCS 9, 15, %2
409SCALE_FUNCS 10, 15, %2
f5975064 410SCALE_FUNCS 12, 15, %2
aba7a827 411SCALE_FUNCS 16, 15, %3
e0c3e073 412%endif ; !sse4
aba7a827
RB
413SCALE_FUNCS 8, 19, %1
414SCALE_FUNCS 9, 19, %2
415SCALE_FUNCS 10, 19, %2
f5975064 416SCALE_FUNCS 12, 19, %2
aba7a827 417SCALE_FUNCS 16, 19, %3
e0c3e073
RB
418%endmacro
419
; Instantiate every variant per instruction set. The three arguments are the
; xmm register counts for 8-bit / 9-to-12-bit / 16-bit input respectively
; (0 for mmx, which has no xmm registers and is only built on x86-32).
3b15a6d7 420%if ARCH_X86_32
aba7a827
RB
421INIT_MMX mmx
422SCALE_FUNCS2 0, 0, 0
e0c3e073 423%endif
aba7a827
RB
424INIT_XMM sse2
425SCALE_FUNCS2 6, 7, 8
426INIT_XMM ssse3
427SCALE_FUNCS2 6, 6, 8
428INIT_XMM sse4
429SCALE_FUNCS2 6, 6, 8