vp8: Add ifdef guards around the sse2 loopfilter in the sse2slow branch too
[libav.git] / libavcodec / x86 / vp8dsp-init.c
/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/vp8dsp.h"

#if HAVE_YASM

/*
 * MC functions
 */
extern void ff_put_vp8_epel4_h4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_bilinear4_h_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);

extern void ff_put_vp8_bilinear4_v_mmx2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);


extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);

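/*
 * TAP_W16/TAP_W8 synthesize a wider MC function from two calls to the next
 * narrower SIMD kernel on adjacent halves of the block; e.g.
 * TAP_W16(sse2, epel, h6) expands (whitespace aside) to
 *
 *     static void ff_put_vp8_epel16_h6_sse2(uint8_t *dst, ptrdiff_t dststride,
 *                                           uint8_t *src, ptrdiff_t srcstride,
 *                                           int height, int mx, int my)
 *     {
 *         ff_put_vp8_epel8_h6_sse2(dst,     dststride, src,     srcstride,
 *                                  height, mx, my);
 *         ff_put_vp8_epel8_h6_sse2(dst + 8, dststride, src + 8, srcstride,
 *                                  height, mx, my);
 *     }
 */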
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}

#if ARCH_X86_32
TAP_W8 (mmx2,  epel, h4)
TAP_W8 (mmx2,  epel, h6)
TAP_W16(mmx2,  epel, h6)
TAP_W8 (mmx2,  epel, v4)
TAP_W8 (mmx2,  epel, v6)
TAP_W16(mmx2,  epel, v6)
TAP_W8 (mmx2,  bilinear, h)
TAP_W16(mmx2,  bilinear, h)
TAP_W8 (mmx2,  bilinear, v)
TAP_W16(mmx2,  bilinear, v)
#endif

TAP_W16(sse2,  epel, h6)
TAP_W16(sse2,  epel, v6)
TAP_W16(sse2,  bilinear, h)
TAP_W16(sse2,  bilinear, v)

TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)

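/*
 * HVTAP builds the combined h+v subpel filters from the separate one-pass
 * kernels declared above: the horizontal pass filters height + TAPNUMY - 1
 * rows (starting TAPNUMY / 2 - 1 rows above the block) into an aligned
 * temporary buffer, and the vertical pass then filters from that buffer
 * into dst.
 */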
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE,      src,    srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE,      height,               mx, my); \
}

#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmx2, 8, x, y,  4,  8) \
HVTAP(mmx2, 8, x, y,  8, 16)

HVTAP(mmx2, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmx2, 8, x, y,  4,  8)
#endif

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)

#define HVTAPSSE2(x, y, w) \
HVTAP(sse2,  16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)

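/*
 * Same two-pass scheme for the bilinear filters: the horizontal pass writes
 * height + 1 rows into a temporary buffer, which the 2-tap vertical pass
 * then consumes.
 */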
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, SIZE,      src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, SIZE,      height,     mx, my); \
}

HVBILIN(mmx2,  8,  4,  8)
#if ARCH_X86_32
HVBILIN(mmx2,  8,  8, 16)
HVBILIN(mmx2,  8, 16, 16)
#endif
HVBILIN(sse2,  8,  8, 16)
HVBILIN(sse2,  8, 16, 16)
HVBILIN(ssse3, 8,  4,  8)
HVBILIN(ssse3, 8,  8, 16)
HVBILIN(ssse3, 8, 16, 16)

extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
                                   ptrdiff_t stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16],
                                    ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16],
                                     ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16],
                                      ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16],
                                      ptrdiff_t stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16],
                                ptrdiff_t stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16],
                                ptrdiff_t stride);

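/*
 * DECLARE_LOOP_FILTER() declares one complete set of loop-filter prototypes
 * (simple, inner and macroblock-edge, luma and chroma, each in a horizontal
 * and a vertical variant) for the instruction-set suffix given in NAME.
 */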
#define DECLARE_LOOP_FILTER(NAME) \
extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                                 ptrdiff_t stride, \
                                                 int flim); \
extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                                 ptrdiff_t stride, \
                                                 int flim); \
extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt);

DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)

#endif

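/*
 * The epel/bilinear tables are indexed as [size][vertical filter][horizontal
 * filter]: size 0/1/2 selects the 16/8/4-pixel-wide functions, and the filter
 * index is 0 for no filtering (copy), 1 for the 4-tap filter and 2 for the
 * 6-tap filter (both map to the single 2-tap kernel in the bilinear case).
 * The macros below fill in every subpel combination for one size and one
 * instruction set.
 */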
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT

#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
    VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)

#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT


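/*
 * Runtime dispatch: each block below overrides the pointers installed by the
 * preceding, weaker instruction-set blocks, so the context ends up with the
 * fastest implementation the running CPU supports.
 */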
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
        c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
        c->vp8_idct_add       = ff_vp8_idct_add_mmx;
        c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;

#if ARCH_X86_32
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (mm_flags & AV_CPU_FLAG_MMX2) {
        VP8_MC_FUNC(2, 4, mmx2);
        VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
#if ARCH_X86_32
        VP8_LUMA_MC_FUNC(0, 16, mmx2);
        VP8_MC_FUNC(1, 8, mmx2);
        VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
        VP8_BILINEAR_MC_FUNC(1, 8, mmx2);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSE) {
        c->vp8_idct_add    = ff_vp8_idct_add_sse;
        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    if (mm_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

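        /* The SSE2 loop filters presumably keep their working set in 16-byte
         * aligned stack storage, so they are only enabled where such alignment
         * is guaranteed (x86-64, or 32-bit builds with HAVE_ALIGNED_STACK). */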
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSE2) {
        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;

#if ARCH_X86_64 || HAVE_ALIGNED_STACK
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;

        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSSE3) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

#if ARCH_X86_64 || HAVE_ALIGNED_STACK
        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSE4) {
        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
#endif
    }
#endif /* HAVE_YASM */
}
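
/*
 * Usage sketch (an assumption about the caller, not something defined in this
 * file): the generic ff_vp8dsp_init() in vp8dsp.c is expected to install the C
 * implementations first and then call ff_vp8dsp_init_x86(c) on x86 builds, so
 * every pointer overridden above already has a working fallback.
 */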