vp8: Add ifdef guards around the sse2 loopfilter in the sse2slow branch too
[libav.git] / libavcodec / x86 / vp8dsp-init.c
CommitLineData
0178d14f
JGG
1/*
2 * VP8 DSP functions x86-optimized
3 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5 *
2912e87a 6 * This file is part of Libav.
0178d14f 7 *
2912e87a 8 * Libav is free software; you can redistribute it and/or
0178d14f
JGG
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
2912e87a 13 * Libav is distributed in the hope that it will be useful,
0178d14f
JGG
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
2912e87a 19 * License along with Libav; if not, write to the Free Software
0178d14f
JGG
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
c6c98d08 23#include "libavutil/cpu.h"
0178d14f
JGG
24#include "libavutil/x86_cpu.h"
25#include "libavcodec/vp8dsp.h"
26
a173aa89
JGG
27#if HAVE_YASM
28
0178d14f
JGG
/*
 * MC functions
 *
 * Prototypes of the sub-pixel (epel) interpolation kernels. Only the
 * declarations appear here; the bodies are presumably provided by the yasm
 * sources, since this whole section is compiled only when HAVE_YASM is set.
 * Naming: ff_put_vp8_epel<width>_<h|v><taps>_<instruction set>.
 */

/* 4 pixels wide, MMX2 */
void ff_put_vp8_epel4_h4_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);

/* 8 pixels wide, SSE2 */
void ff_put_vp8_epel8_h4_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                              ptrdiff_t srcstride, int height, int mx, int my);

/* 4 pixels wide, SSSE3 */
void ff_put_vp8_epel4_h4_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);

/* 8 pixels wide, SSSE3 */
void ff_put_vp8_epel8_h4_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                               ptrdiff_t srcstride, int height, int mx, int my);
82
/*
 * Prototypes of the bilinear interpolation kernels (declarations only).
 * Naming: ff_put_vp8_bilinear<width>_<h|v>_<instruction set>.
 */

/* horizontal pass */
void ff_put_vp8_bilinear4_h_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                 ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                 ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                  ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                  ptrdiff_t srcstride, int height, int mx, int my);

/* vertical pass */
void ff_put_vp8_bilinear4_v_mmx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                 ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                 ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                  ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                                  ptrdiff_t srcstride, int height, int mx, int my);
108
b06855f1 109
bd66f073
RB
/*
 * Prototypes of the routines installed in the [0][0] slot of the MC tables
 * by ff_vp8dsp_init_x86() (declarations only).
 */
void ff_put_vp8_pixels8_mmx(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int height, int mx, int my);
119
a173aa89
JGG
/*
 * TAP_W16(ISA, KIND, TAP): define a static 16-pixel-wide filter
 * ff_put_vp8_<KIND>16_<TAP>_<ISA>() that runs the matching 8-pixel-wide
 * routine twice: once on the left half and once 8 bytes further right in
 * both dst and src.
 */
#define TAP_W16(ISA, KIND, TAP)                                              \
static void ff_put_vp8_ ## KIND ## 16_ ## TAP ## _ ## ISA(                   \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                         \
    ptrdiff_t srcstride, int height, int mx, int my)                         \
{                                                                            \
    /* left 8 columns */                                                     \
    ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                            \
        dst, dststride, src, srcstride, height, mx, my);                     \
    /* right 8 columns */                                                    \
    ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                            \
        dst + 8, dststride, src + 8, srcstride, height, mx, my);             \
}
a173aa89
JGG
/*
 * TAP_W8(ISA, KIND, TAP): define a static 8-pixel-wide filter
 * ff_put_vp8_<KIND>8_<TAP>_<ISA>() that runs the matching 4-pixel-wide
 * routine twice: once on the left half and once 4 bytes further right in
 * both dst and src.
 */
#define TAP_W8(ISA, KIND, TAP)                                               \
static void ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                    \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                         \
    ptrdiff_t srcstride, int height, int mx, int my)                         \
{                                                                            \
    /* left 4 columns */                                                     \
    ff_put_vp8_ ## KIND ## 4_ ## TAP ## _ ## ISA(                            \
        dst, dststride, src, srcstride, height, mx, my);                     \
    /* right 4 columns */                                                    \
    ff_put_vp8_ ## KIND ## 4_ ## TAP ## _ ## ISA(                            \
        dst + 4, dststride, src + 4, srcstride, height, mx, my);             \
}
140
45549339 141#if ARCH_X86_32
e25be471
RB
142TAP_W8 (mmx2, epel, h4)
143TAP_W8 (mmx2, epel, h6)
144TAP_W16(mmx2, epel, h6)
145TAP_W8 (mmx2, epel, v4)
146TAP_W8 (mmx2, epel, v6)
147TAP_W16(mmx2, epel, v6)
148TAP_W8 (mmx2, bilinear, h)
149TAP_W16(mmx2, bilinear, h)
150TAP_W8 (mmx2, bilinear, v)
151TAP_W16(mmx2, bilinear, v)
45549339 152#endif
a173aa89 153
e25be471
RB
154TAP_W16(sse2, epel, h6)
155TAP_W16(sse2, epel, v6)
156TAP_W16(sse2, bilinear, h)
157TAP_W16(sse2, bilinear, v)
a173aa89 158
e25be471
RB
159TAP_W16(ssse3, epel, h6)
160TAP_W16(ssse3, epel, v6)
161TAP_W16(ssse3, bilinear, h)
162TAP_W16(ssse3, bilinear, v)
0178d14f
JGG
163
/*
 * HVTAP(ISA, ALIGNMENT, XTAPS, YTAPS, W, MAXH): define the static 2-D filter
 * ff_put_vp8_epel<W>_h<XTAPS>v<YTAPS>_<ISA>() as two 1-D passes.
 *
 * The horizontal pass starts (YTAPS / 2 - 1) rows above src and writes
 * height + YTAPS - 1 rows into an aligned on-stack buffer whose stride is W.
 * The vertical pass then reads that buffer, starting (YTAPS / 2 - 1) rows
 * in, and writes the final result to dst.
 * The buffer holds W * (MAXH + YTAPS - 1) bytes.
 */
#define HVTAP(ISA, ALIGNMENT, XTAPS, YTAPS, W, MAXH)                         \
static void ff_put_vp8_epel ## W ## _h ## XTAPS ## v ## YTAPS ## _ ## ISA(   \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                         \
    ptrdiff_t srcstride, int height, int mx, int my)                         \
{                                                                            \
    DECLARE_ALIGNED(ALIGNMENT, uint8_t, tmp)[W * (MAXH + YTAPS - 1)];        \
    const int lead_rows = YTAPS / 2 - 1;                                     \
    ff_put_vp8_epel ## W ## _h ## XTAPS ## _ ## ISA(                         \
        tmp, W, src - srcstride * lead_rows, srcstride,                      \
        height + YTAPS - 1, mx, my);                                         \
    ff_put_vp8_epel ## W ## _v ## YTAPS ## _ ## ISA(                         \
        dst, dststride, tmp + W * lead_rows, W, height, mx, my);             \
}
177
45549339 178#if ARCH_X86_32
0178d14f 179#define HVTAPMMX(x, y) \
e25be471
RB
180HVTAP(mmx2, 8, x, y, 4, 8) \
181HVTAP(mmx2, 8, x, y, 8, 16)
0178d14f 182
e25be471 183HVTAP(mmx2, 8, 6, 6, 16, 16)
45549339
RB
184#else
185#define HVTAPMMX(x, y) \
e25be471 186HVTAP(mmx2, 8, x, y, 4, 8)
45549339
RB
187#endif
188
0178d14f
JGG
189HVTAPMMX(4, 4)
190HVTAPMMX(4, 6)
191HVTAPMMX(6, 4)
192HVTAPMMX(6, 6)
0178d14f
JGG
193
194#define HVTAPSSE2(x, y, w) \
195HVTAP(sse2, 16, x, y, w, 16) \
196HVTAP(ssse3, 16, x, y, w, 16)
197
198HVTAPSSE2(4, 4, 8)
199HVTAPSSE2(4, 6, 8)
200HVTAPSSE2(6, 4, 8)
201HVTAPSSE2(6, 6, 8)
202HVTAPSSE2(6, 6, 16)
a173aa89 203
dcc602d8
JGG
204HVTAP(ssse3, 16, 4, 4, 4, 8)
205HVTAP(ssse3, 16, 4, 6, 4, 8)
206HVTAP(ssse3, 16, 6, 4, 4, 8)
207HVTAP(ssse3, 16, 6, 6, 4, 8)
208
a173aa89
JGG
/*
 * HVBILIN(ISA, ALIGNMENT, W, MAXH): define the static 2-D bilinear filter
 * ff_put_vp8_bilinear<W>_hv_<ISA>() as two 1-D passes.
 *
 * The horizontal pass writes height + 1 rows into an aligned on-stack buffer
 * (stride W, room for MAXH + 2 rows); the vertical pass then reads that
 * buffer from its start and writes the final result to dst.
 */
#define HVBILIN(ISA, ALIGNMENT, W, MAXH)                                     \
static void ff_put_vp8_bilinear ## W ## _hv_ ## ISA(                         \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                         \
    ptrdiff_t srcstride, int height, int mx, int my)                         \
{                                                                            \
    DECLARE_ALIGNED(ALIGNMENT, uint8_t, tmp)[W * (MAXH + 2)];                \
    /* pass 1: horizontal, into the scratch buffer */                        \
    ff_put_vp8_bilinear ## W ## _h_ ## ISA(                                  \
        tmp, W, src, srcstride, height + 1, mx, my);                         \
    /* pass 2: vertical, from the scratch buffer to dst */                   \
    ff_put_vp8_bilinear ## W ## _v_ ## ISA(                                  \
        dst, dststride, tmp, W, height, mx, my);                             \
}
220
e25be471 221HVBILIN(mmx2, 8, 4, 8)
45549339 222#if ARCH_X86_32
e25be471
RB
223HVBILIN(mmx2, 8, 8, 16)
224HVBILIN(mmx2, 8, 16, 16)
45549339 225#endif
e25be471
RB
226HVBILIN(sse2, 8, 8, 16)
227HVBILIN(sse2, 8, 16, 16)
228HVBILIN(ssse3, 8, 4, 8)
229HVBILIN(ssse3, 8, 8, 16)
230HVBILIN(ssse3, 8, 16, 16)
0178d14f 231
bd66f073
RB
232extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
233 ptrdiff_t stride);
234extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16],
235 ptrdiff_t stride);
236extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16],
237 ptrdiff_t stride);
238extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16],
239 ptrdiff_t stride);
240extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16],
241 ptrdiff_t stride);
b8b231b5 242extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
827d43bb 243extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
bd66f073
RB
244extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16],
245 ptrdiff_t stride);
246extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16],
247 ptrdiff_t stride);
f2a30bd8 248
/*
 * DECLARE_LOOP_FILTER(ISA): declare the complete set of loop-filter entry
 * points for one instruction set: the "simple" filter, the "inner" filter
 * and the "mbedge" filter, each in a vertical (v) and a horizontal (h)
 * flavour, with separate luma (16y) and chroma (8uv) variants for the latter
 * two. The expansion ends with its own ';', so invocations take none.
 */
#define DECLARE_LOOP_FILTER(ISA)                                              \
void ff_vp8_v_loop_filter_simple_ ## ISA(uint8_t *dst, ptrdiff_t stride,      \
                                         int flim);                           \
void ff_vp8_h_loop_filter_simple_ ## ISA(uint8_t *dst, ptrdiff_t stride,      \
                                         int flim);                           \
void ff_vp8_v_loop_filter16y_inner_ ## ISA(uint8_t *dst, ptrdiff_t stride,    \
                                           int e, int i, int hvt);            \
void ff_vp8_h_loop_filter16y_inner_ ## ISA(uint8_t *dst, ptrdiff_t stride,    \
                                           int e, int i, int hvt);            \
void ff_vp8_v_loop_filter8uv_inner_ ## ISA(uint8_t *dstU, uint8_t *dstV,      \
                                           ptrdiff_t s,                       \
                                           int e, int i, int hvt);            \
void ff_vp8_h_loop_filter8uv_inner_ ## ISA(uint8_t *dstU, uint8_t *dstV,      \
                                           ptrdiff_t s,                       \
                                           int e, int i, int hvt);            \
void ff_vp8_v_loop_filter16y_mbedge_ ## ISA(uint8_t *dst, ptrdiff_t stride,   \
                                            int e, int i, int hvt);           \
void ff_vp8_h_loop_filter16y_mbedge_ ## ISA(uint8_t *dst, ptrdiff_t stride,   \
                                            int e, int i, int hvt);           \
void ff_vp8_v_loop_filter8uv_mbedge_ ## ISA(uint8_t *dstU, uint8_t *dstV,     \
                                            ptrdiff_t s,                      \
                                            int e, int i, int hvt);           \
void ff_vp8_h_loop_filter8uv_mbedge_ ## ISA(uint8_t *dstU, uint8_t *dstV,     \
                                            ptrdiff_t s,                      \
                                            int e, int i, int hvt);

DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
7dd224a4 290
a173aa89
JGG
291#endif
292
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT): store the six-tap epel functions for
 * block width SIZE and instruction set OPT into row IDX of
 * c->put_vp8_epel_pixels_tab: slots [0][2] (h6), [2][0] (v6) and
 * [2][2] (h6v6).
 *
 * Expects a pointer named `c` in scope at the point of use. The body is
 * wrapped in do { } while (0) so the macro acts as ONE statement: it is safe
 * under an unbraced if/else and must be followed by a ';'.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)

/*
 * VP8_MC_FUNC(IDX, SIZE, OPT): store every epel function (four-tap, mixed
 * and six-tap) for block width SIZE and instruction set OPT into row IDX of
 * c->put_vp8_epel_pixels_tab. Slot [0][0] is not touched.
 *
 * Same usage rules as VP8_LUMA_MC_FUNC: needs `c` in scope, behaves as a
 * single statement, and must be followed by a ';'.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
305
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT): fill row IDX of
 * c->put_vp8_bilinear_pixels_tab with the bilinear functions for block
 * width SIZE and instruction set OPT. Slots [0][1..2] get the h function,
 * [1..2][0] get the v function and the remaining four get the hv function.
 * Slot [0][0] is not touched.
 *
 * Expects a pointer named `c` in scope at the point of use. The body is
 * wrapped in do { } while (0) so the macro acts as ONE statement: it is safe
 * under an unbraced if/else and must be followed by a ';'.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
315
0178d14f
JGG
316
317av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
318{
265980da 319#if HAVE_YASM
c6c98d08 320 int mm_flags = av_get_cpu_flags();
0178d14f 321
7160bb71 322 if (mm_flags & AV_CPU_FLAG_MMX) {
3ae079a3 323 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
3ae079a3 324 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
45549339
RB
325#if ARCH_X86_32
326 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
3ae079a3
JGG
327 c->vp8_idct_add = ff_vp8_idct_add_mmx;
328 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
0fecad09
JGG
329 c->put_vp8_epel_pixels_tab[0][0][0] =
330 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
45549339 331#endif
0fecad09
JGG
332 c->put_vp8_epel_pixels_tab[1][0][0] =
333 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
f2a30bd8 334
45549339 335#if ARCH_X86_32
f2a30bd8
RB
336 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
337 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
a711eb48 338
3facfc99
RB
339 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
340 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
268821e7
RB
341 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
342 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
e9e456d8
RB
343
344 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
345 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
346 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
347 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
45549339 348#endif
0178d14f
JGG
349 }
350
351 /* note that 4-tap width=16 functions are missing because w=16
352 * is only used for luma, and luma is always a copy or sixtap. */
7160bb71 353 if (mm_flags & AV_CPU_FLAG_MMX2) {
e25be471
RB
354 VP8_MC_FUNC(2, 4, mmx2);
355 VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
45549339 356#if ARCH_X86_32
e25be471
RB
357 VP8_LUMA_MC_FUNC(0, 16, mmx2);
358 VP8_MC_FUNC(1, 8, mmx2);
359 VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
360 VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
361
362 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
363 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
364
365 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
366 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
367 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
368 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
369
370 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
371 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
372 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
373 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
45549339 374#endif
0178d14f
JGG
375 }
376
7160bb71 377 if (mm_flags & AV_CPU_FLAG_SSE) {
c25c7767 378 c->vp8_idct_add = ff_vp8_idct_add_sse;
827d43bb 379 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
0fecad09
JGG
380 c->put_vp8_epel_pixels_tab[0][0][0] =
381 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
382 }
383
7160bb71 384 if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
a173aa89
JGG
385 VP8_LUMA_MC_FUNC(0, 16, sse2);
386 VP8_MC_FUNC(1, 8, sse2);
387 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
388 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
f2a30bd8
RB
389
390 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
a711eb48 391
07eeeb1d 392#if ARCH_X86_64 || HAVE_ALIGNED_STACK
3facfc99 393 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
268821e7 394 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
e9e456d8 395
003243c3
RB
396 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
397 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
07eeeb1d 398#endif
6526976f
RB
399 }
400
7160bb71 401 if (mm_flags & AV_CPU_FLAG_SSE2) {
3ae079a3 402 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
8a467b2d 403
6341838f
RB
404 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
405
70a1c800 406#if ARCH_X86_64 || HAVE_ALIGNED_STACK
6526976f 407 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
268821e7 408 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
e9e456d8 409
003243c3
RB
410 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
411 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
70a1c800 412#endif
0178d14f
JGG
413 }
414
7160bb71 415 if (mm_flags & AV_CPU_FLAG_SSSE3) {
a173aa89
JGG
416 VP8_LUMA_MC_FUNC(0, 16, ssse3);
417 VP8_MC_FUNC(1, 8, ssse3);
dcc602d8 418 VP8_MC_FUNC(2, 4, ssse3);
a173aa89
JGG
419 VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
420 VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
b06855f1 421 VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
7dd224a4
JGG
422
423 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
424 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
425
70a1c800 426#if ARCH_X86_64 || HAVE_ALIGNED_STACK
7dd224a4
JGG
427 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
428 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
429 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
430 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
431
432 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
003243c3 433 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
7dd224a4 434 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
003243c3 435 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
70a1c800 436#endif
0178d14f
JGG
437 }
438
7160bb71 439 if (mm_flags & AV_CPU_FLAG_SSE4) {
0178d14f 440 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
dc5eec80 441
6341838f 442 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
70a1c800 443#if ARCH_X86_64 || HAVE_ALIGNED_STACK
dc5eec80
RB
444 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
445 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
70a1c800 446#endif
0178d14f 447 }
30bdefd1 448#endif
0178d14f 449}