;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control vector that reverses the byte order within each 32-bit lane
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
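; Roughly, this computes the signed dot product of two int16 vectors. A
; minimal C sketch of the intended result (illustrative only, assuming order
; is a multiple of the elements consumed per loop iteration, since there is
; no scalar tail):
;
;     int scalarproduct_int16(const int16_t *v1, const int16_t *v2, int order)
;     {
;         int sum = 0;
;         while (order--)
;             sum += *v1++ * *v2++;
;         return sum;
;     }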
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    ; v1q/v2q now point past the end of each vector; orderq is a negative
    ; byte offset that counts up towards zero, so the add at the bottom of
    ; the loop doubles as the loop condition.
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
; horizontally add the dword partial sums accumulated in m2
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
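; Each element of src is clamped to [min, max] and written to dst. A minimal
; C sketch of the intended result (illustrative only, assuming len is a
; multiple of the elements handled per loop, since the asm has no scalar
; tail):
;
;     void vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                            int32_t max, unsigned int len)
;     {
;         while (len--) {
;             int32_t v = *src++;
;             *dst++ = v < min ? min : v > max ? max : v;
;         }
;     }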

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
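; For example, "VECTOR_CLIP_INT32 6, 1, 0, 0, _int" under INIT_XMM sse2 (see
; the instantiations below) builds the SSE2 integer variant
; (ff_vector_clip_int32_int_sse2): 6 xmm registers, one load/clip/store pass
; of 4*mmsize bytes per loop, integer min/max, and an "_int" name suffix.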
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova     m10,  [srcq+mmsize*(7+%%i)]
%endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD m10,  m4, m5, m6
%endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
%if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
%endif
    ; step the block index by the number of registers stored above, so that
    ; successive %rep iterations cover consecutive mmsize-sized blocks
%assign %%i %%i+4*(1+%3)
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
; m8-m15 exist only on x86-64, so the wider 8*mmsize-per-loop variant is
; built only there
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
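; Shared body for the aligned and unaligned paths of bswap32_buf below:
; byte-swaps 8 dwords (two xmm registers) per iteration, then one optional
; group of 4 dwords, leaving at most 3 dwords for the caller's .left tail.
; r0/r1 are the dst/src pointers and r2 holds the dword count.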
%macro BSWAP_LOOPS 1
    mov      r3, r2
    sar      r2, 3              ; r2 = number of 8-dword iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    ; without pshufb: swap the two 16-bit words of each dword with
    ; pshuflw/pshufhw, then swap the bytes within each word via shift+or
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4              ; is there a group of 4 dwords left?
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
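; A minimal C sketch of the intended result (illustrative only): each 32-bit
; word of src is byte-reversed into dst, e.g. with libavutil's av_bswap32():
;
;     void bswap32_buf(uint32_t *dst, const uint32_t *src, int w)
;     {
;         while (w--)
;             *dst++ = av_bswap32(*src++);
;     }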
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0            ; take the aligned path only if both src and
    and      r3, 15            ; dst are 16-byte aligned
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:
%if cpuflag(ssse3)
    ; handle the remaining 0-3 dwords: first a pair, then a single one
    mov      r3, r2
    and      r2, 2
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    ; handle the remaining 0-3 dwords one at a time
    and      r2, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF