sws: implement MMX/SSE2/SSSE3/SSE4 versions for horizontal scaling.
libswscale/x86/scale.asm
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

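; Constants used below:
;   max_19bit_int/flt: clip value (2^19 - 1) for the 19-bit output paths, as an
;                      integer for the pminsd path (sse4, and the mmx emulation)
;                      and as a float for the minps path (sse2/ssse3, which have
;                      no packed signed dword minimum)
;   minshort:          0x8000 bias used to turn unsigned 16-bit input into
;                      signed words before pmaddwd
;   unicoeff:          0x8000 * 0x4000, i.e. 0x8000 * sum(coeffs) for 14-bit
;                      coefficients summing to 1.0; added back after the
;                      multiply-accumulate to undo the bias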
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int16_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8 or 16 bits wide ($source_width
; can be 8, 9, 10 or 16; the difference is whether we have to downscale before
; multiplying). Filter coefficients are 14-bit. Output is either 15 bits
; (as int16_t) or 19 bits (as int32_t), as given by $intermediate_nbits.
; Each output pixel is generated from $filterSize input pixels; the position
; of the first input pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------

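; For reference, a rough C model of the 16-bit-to-19-bit case follows; the
; function name, signature and use of FFMIN are illustrative only and are not
; taken from the C code in libswscale:
;
;     static void hscale16to19_c(int32_t *dst, int dstW, const uint16_t *src,
;                                const int16_t *filter, const int16_t *filterPos,
;                                int filterSize)
;     {
;         for (int i = 0; i < dstW; i++) {
;             int     pos = filterPos[i];
;             int64_t sum = 0;
;             for (int j = 0; j < filterSize; j++)
;                 sum += (int64_t) src[pos + j] * filter[i * filterSize + j];
;             // 14-bit coefficients: 14 + 16 - 19 matches the psrad shift below
;             dst[i] = FFMIN(sum >> (14 + 16 - 19), (1 << 19) - 1);
;         }
;     }
;
; The other variants differ only in the input/output types, the shift and the clip.
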
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm
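; e.g. SCALE_FUNC 8, 15, 4, 4, sse2, 6, 6 (as emitted by the SCALE_FUNCS/
; SCALE_FUNCS2 macros at the bottom of this file) generates hscale8to15_4_sse2
; with 6 named arguments and 6 xmm registers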
%macro SCALE_FUNC 7
cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elifidn %5, sse4
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           r2, 1                 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define r2shr 1
%else ; %3 == 4
%define r2shr 0
%endif ; %3 == 8
    lea           r4, [r4+r2*8]
%if %2 == 15
    lea           r1, [r1+r2*(2>>r2shr)]
%else ; %2 == 19
    lea           r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19
    lea           r5, [r5+r2*(2>>r2shr)]
    neg           r2

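    ; r2 now runs from -dstW (scaled as above) up to 0; r1/r4/r5 were advanced
    ; to the end of dst/filter/filterPos, so indexing with the negative r2
    ; walks them forward, and "jl .loop" below exits once r2 reaches 0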
.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
    movsx         r6, word [r5+r2*2+2]  ; filterPos[1]
    movlh         m0, [r3+r0*srcmul]    ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%endif
    movsx         r0, word [r5+r2*2+4]  ; filterPos[2]
    movsx         r6, word [r5+r2*2+6]  ; filterPos[3]
    movlh         m1, [r3+r0*srcmul]    ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [r3+r6*srcmul]    ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [r3+r6*srcmul]    ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8 && %5 <= ssse3
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                ; byte -> word
    punpcklbw     m1, m3                ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd multiplies signed words, so convert the unsigned input to signed;
             ; we'll add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elifidn %5, sse2
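    ; sse2 has no phaddd: gather the even dwords of m0/m1 with one shufps and
    ; the odd dwords with another, then add the two vertically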
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                        ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                        ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                        ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    movsx         r0, word [r5+r2*1+0]  ; filterPos[0]
    movsx         r6, word [r5+r2*1+2]  ; filterPos[1]
    movbh         m0, [r3+ r0   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    movsx         r0, word [r5+r2*1+4]  ; filterPos[2]
    movsx         r6, word [r5+r2*1+6]  ; filterPos[3]
    movbh         m4, [r3+ r0   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [r3+ r6   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                ; byte -> word
    punpcklbw     m1, m3                ; byte -> word
    punpcklbw     m4, m3                ; byte -> word
    punpcklbw     m5, m3                ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd multiplies signed words, so convert the unsigned input to signed;
             ; we'll add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elifidn %5, sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova          mex, m0
    punpckldq     m0, m1
    punpckhdq     mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                        ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                        ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                        ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

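; The X variants handle arbitrary filter sizes with an inner loop over the
; taps, consuming mmsize/2 source pixels per iteration. X8 assumes the filter
; size is covered by whole iterations; X4 (sse only) leaves 4 taps per output
; pixel over, which are handled separately after the inner loop (hence r6sub
; below and the %ifidn %4, X4 block after .innerloop).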
%ifidn %4, X4
%define r6sub 4
%else ; %4 == X || %4 == X8
%define r6sub 0
%endif ; %4 ==/!= X4
%ifdef ARCH_X86_64
    push          r12
    movsxd        r6, r6d               ; filterSize
    lea           r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
%define src_reg r11
%define r1x     r10
%define filter2 r12
%else ; x86-32
    lea           r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
    mov           r6m, r0
%define src_reg r3
%define r1x     r1
%define filter2 r6m
%endif ; x86-32/64
    lea           r5, [r5+r2*2]
%if %2 == 15
    lea           r1, [r1+r2*2]
%else ; %2 == 19
    lea           r1, [r1+r2*4]
%endif ; %2 == 15/19
    movifnidn     r1mp, r1
    neg           r2

.loop:
    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
    movsx         r1x, word [r5+r2*2+2] ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov           src_reg, r3mp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [src_reg+r0         *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [src_reg+(r1x+r6sub)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd multiplies signed words, so convert the unsigned input to signed;
             ; we'll add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4             ] ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [r4+(r6+r6sub)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add           r4, mmsize
    add           src_reg, srcmul*mmsize/2
    cmp           src_reg, filter2      ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    movsx         r1x, word [r5+r2*2+2] ; filterPos[1]
    movlh         m0, [src_reg+r0     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub           r1x, r6               ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [src_reg+(r1x+r6sub)*srcmul]
%else ; %1 == 8
    movd          m1, [src_reg+(r1x+r6sub)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8 && %5 <= ssse3
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd multiplies signed words, so convert the unsigned input to signed;
             ; we'll add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]
%endif ; %4 == X4

    lea           r4, [r4+(r6+r6sub)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%ifidn %5, sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%ifidn %5, sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn     r1, r1mp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh          [r1+r2*(2>>r2shr)], m0
%else ; %3 == X
    movd          [r1+r2*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elifidn %5, sse4
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    movu          [r1+r2*(4>>r2shr)], m0
%else ; %3 == X
    movq          [r1+r2*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           r2, (mmsize<<r2shr)/4 ; both the 8-tap and the 4-tap filter really only do 4 pixels
                                        ; (or for mmx: 2 pixels) per iteration; see "shl r2, 1" above for why
%else ; %3 == X
    add           r2, 2
%endif ; %3 ==/!= X
    jl .loop
%ifnidn %3, X
    REP_RET
%else ; %3 == X
%ifdef ARCH_X86_64
    pop           r12
    RET
%else ; x86-32
    REP_RET
%endif ; x86-32/64
%endif ; %3 ==/!= X
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm
%macro SCALE_FUNCS 4
SCALE_FUNC %1, %2, 4, 4, %3, 6, %4
SCALE_FUNC %1, %2, 8, 8, %3, 6, %4
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, %3, 7, %4
%else
SCALE_FUNC %1, %2, X, X4, %3, 7, %4
SCALE_FUNC %1, %2, X, X8, %3, 7, %4
%endif
%endmacro

; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 4
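; sse4 only differs in the 19-bit clip (pminsd), so no 15-bit versions are
; instantiated for it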
%ifnidn %1, sse4
SCALE_FUNCS  8, 15, %1, %2
SCALE_FUNCS  9, 15, %1, %3
SCALE_FUNCS 10, 15, %1, %3
SCALE_FUNCS 16, 15, %1, %4
%endif ; !sse4
SCALE_FUNCS  8, 19, %1, %2
SCALE_FUNCS  9, 19, %1, %3
SCALE_FUNCS 10, 19, %1, %3
SCALE_FUNCS 16, 19, %1, %4
%endmacro

%ifdef ARCH_X86_32
INIT_MMX
SCALE_FUNCS2  mmx, 0, 0, 0
%endif
INIT_XMM
SCALE_FUNCS2  sse2, 6, 7, 8
SCALE_FUNCS2 ssse3, 6, 6, 8
SCALE_FUNCS2  sse4, 6, 6, 8