Fix linking if MMX is disabled.
[libav.git] / libavcodec / h264pred.c
1 /*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * H.264 / AVC / MPEG4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
26 */
27
28 #include "avcodec.h"
29 #include "mpegvideo.h"
30 #include "h264pred.h"
31 #include "mathops.h"
32
/* 4x4 vertical prediction: replicate the row of four pixels directly
 * above the block into all four rows.  Each row is written as a single
 * 32-bit store.  topright is unused. */
static void pred4x4_vertical_c(uint8_t *src, const uint8_t *topright, int stride){
    int i;
    const uint32_t top = ((uint32_t*)(src-stride))[0];

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= top;
}
40
/* 4x4 horizontal prediction: each row is filled with its own left
 * neighbour, replicated across the row via a 0x01010101 multiply.
 * topright is unused. */
static void pred4x4_horizontal_c(uint8_t *src, const uint8_t *topright, int stride){
    int i;

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= src[-1+i*stride]*0x01010101;
}
47
/* 4x4 DC prediction: fill the block with the rounded mean of the four
 * top and four left neighbours.  topright is unused. */
static void pred4x4_dc_c(uint8_t *src, const uint8_t *topright, int stride){
    int i;
    int sum = 4;                       /* rounding bias for >>3 */

    for(i=0; i<4; i++)
        sum += src[i-stride] + src[-1+i*stride];

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= (sum>>3)*0x01010101;
}
57
/* 4x4 left-DC prediction: only the left edge is available, so fill the
 * block with the rounded mean of the four left neighbours. */
static void pred4x4_left_dc_c(uint8_t *src, const uint8_t *topright, int stride){
    int i, sum = 2;
    uint32_t fill;

    for(i=0; i<4; i++)
        sum += src[-1+i*stride];
    fill = (sum>>2) * 0x01010101U;

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= fill;
}
66
/* 4x4 top-DC prediction: only the top edge is available, so fill the
 * block with the rounded mean of the four top neighbours. */
static void pred4x4_top_dc_c(uint8_t *src, const uint8_t *topright, int stride){
    int i, sum = 2;
    uint32_t fill;

    for(i=0; i<4; i++)
        sum += src[i-stride];
    fill = (sum>>2) * 0x01010101U;

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= fill;
}
75
/* 4x4 DC prediction with no neighbours available: fill with mid-grey
 * (128).  topright is unused. */
static void pred4x4_128_dc_c(uint8_t *src, const uint8_t *topright, int stride){
    int i;

    for(i=0; i<4; i++)
        ((uint32_t*)(src+i*stride))[0]= 128U*0x01010101U;
}
82
83
/* Edge-loading helper macros: each pulls four neighbouring pixels of the
 * current 4x4 block into local ints (t* = row above / above-right,
 * l* = column to the left / below-left).  av_unused silences warnings
 * when a caller does not consume every value. */
84 #define LOAD_TOP_RIGHT_EDGE\
85 const int av_unused t4= topright[0];\
86 const int av_unused t5= topright[1];\
87 const int av_unused t6= topright[2];\
88 const int av_unused t7= topright[3];\
89 
/* l4..l7: the four pixels left of rows 4..7 (below-left of the block). */
90 #define LOAD_DOWN_LEFT_EDGE\
91 const int av_unused l4= src[-1+4*stride];\
92 const int av_unused l5= src[-1+5*stride];\
93 const int av_unused l6= src[-1+6*stride];\
94 const int av_unused l7= src[-1+7*stride];\
95 
/* l0..l3: the four pixels immediately left of the block. */
96 #define LOAD_LEFT_EDGE\
97 const int av_unused l0= src[-1+0*stride];\
98 const int av_unused l1= src[-1+1*stride];\
99 const int av_unused l2= src[-1+2*stride];\
100 const int av_unused l3= src[-1+3*stride];\
101 
/* t0..t3: the four pixels immediately above the block. */
102 #define LOAD_TOP_EDGE\
103 const int av_unused t0= src[ 0-1*stride];\
104 const int av_unused t1= src[ 1-1*stride];\
105 const int av_unused t2= src[ 2-1*stride];\
106 const int av_unused t3= src[ 3-1*stride];\
107 
/* VP8 4x4 vertical prediction: the top row is first smoothed with a
 * [1 2 1] filter (using top-left and top-right neighbours), then the
 * filtered row is replicated into all four rows with aligned 32-bit
 * stores (AV_WN32A). */
108 static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
109 const int lt= src[-1-1*stride];
110 LOAD_TOP_EDGE
111 LOAD_TOP_RIGHT_EDGE
112 uint32_t v = PACK4UINT8((lt + 2*t0 + t1 + 2) >> 2,
113 (t0 + 2*t1 + t2 + 2) >> 2,
114 (t1 + 2*t2 + t3 + 2) >> 2,
115 (t2 + 2*t3 + t4 + 2) >> 2);
116 
117 AV_WN32A(src+0*stride, v);
118 AV_WN32A(src+1*stride, v);
119 AV_WN32A(src+2*stride, v);
120 AV_WN32A(src+3*stride, v);
121 }
122
/* VP8 4x4 horizontal prediction: each row is filled with a [1 2 1]
 * filtered left neighbour.  The last row reuses l3 in place of the
 * (unavailable) below-left pixel. */
123 static void pred4x4_horizontal_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
124 const int lt= src[-1-1*stride];
125 LOAD_LEFT_EDGE
126 
127 AV_WN32A(src+0*stride, ((lt + 2*l0 + l1 + 2) >> 2)*0x01010101);
128 AV_WN32A(src+1*stride, ((l0 + 2*l1 + l2 + 2) >> 2)*0x01010101);
129 AV_WN32A(src+2*stride, ((l1 + 2*l2 + l3 + 2) >> 2)*0x01010101);
130 AV_WN32A(src+3*stride, ((l2 + 2*l3 + l3 + 2) >> 2)*0x01010101);
131 }
132
/* H.264 4x4 diagonal down-right prediction: pixels along each 45-degree
 * down-right diagonal share one value, taken from a [1 2 1] filter over
 * the left column, top-left corner and top row. */
133 static void pred4x4_down_right_c(uint8_t *src, const uint8_t *topright, int stride){
134 const int lt= src[-1-1*stride];
135 LOAD_TOP_EDGE
136 LOAD_LEFT_EDGE
137 
138 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
139 src[0+2*stride]=
140 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
141 src[0+1*stride]=
142 src[1+2*stride]=
143 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
144 src[0+0*stride]=
145 src[1+1*stride]=
146 src[2+2*stride]=
147 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
148 src[1+0*stride]=
149 src[2+1*stride]=
150 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
151 src[2+0*stride]=
152 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
153 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
154 }
155
/* H.264 4x4 diagonal down-left prediction: each down-left diagonal is a
 * [1 2 1] filter over the top and top-right rows; the bottom-right pixel
 * uses t7 twice (no pixel beyond the top-right edge). */
156 static void pred4x4_down_left_c(uint8_t *src, const uint8_t *topright, int stride){
157 LOAD_TOP_EDGE
158 LOAD_TOP_RIGHT_EDGE
159 // LOAD_LEFT_EDGE
160 
161 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
162 src[1+0*stride]=
163 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
164 src[2+0*stride]=
165 src[1+1*stride]=
166 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
167 src[3+0*stride]=
168 src[2+1*stride]=
169 src[1+2*stride]=
170 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
171 src[3+1*stride]=
172 src[2+2*stride]=
173 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
174 src[3+2*stride]=
175 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
176 src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
177 }
178
/* SVQ3 variant of 4x4 down-left prediction: each diagonal is the plain
 * average of one left and one top neighbour; t0/l0 are loaded but unused
 * (unu0/unu1 keep the LOAD macros warning-free). */
179 static void pred4x4_down_left_svq3_c(uint8_t *src, const uint8_t *topright, int stride){
180 LOAD_TOP_EDGE
181 LOAD_LEFT_EDGE
182 const av_unused int unu0= t0;
183 const av_unused int unu1= l0;
184 
185 src[0+0*stride]=(l1 + t1)>>1;
186 src[1+0*stride]=
187 src[0+1*stride]=(l2 + t2)>>1;
/* every remaining diagonal saturates at (l3 + t3) >> 1 */
188 src[2+0*stride]=
189 src[1+1*stride]=
190 src[0+2*stride]=
191 src[3+0*stride]=
192 src[2+1*stride]=
193 src[1+2*stride]=
194 src[0+3*stride]=
195 src[3+1*stride]=
196 src[2+2*stride]=
197 src[1+3*stride]=
198 src[3+2*stride]=
199 src[2+3*stride]=
200 src[3+3*stride]=(l3 + t3)>>1;
201 }
202
/* RV40 variant of 4x4 down-left prediction: averages the H.264-style
 * [1 2 1] filters of BOTH the top/top-right edge and the left/below-left
 * edge (hence the >>3 with double rounding bias). */
203 static void pred4x4_down_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
204 LOAD_TOP_EDGE
205 LOAD_TOP_RIGHT_EDGE
206 LOAD_LEFT_EDGE
207 LOAD_DOWN_LEFT_EDGE
208 
209 src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
210 src[1+0*stride]=
211 src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
212 src[2+0*stride]=
213 src[1+1*stride]=
214 src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
215 src[3+0*stride]=
216 src[2+1*stride]=
217 src[1+2*stride]=
218 src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
219 src[3+1*stride]=
220 src[2+2*stride]=
221 src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
222 src[3+2*stride]=
223 src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
224 src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
225 }
226
/* Same as pred4x4_down_left_rv40_c but for blocks where the below-left
 * edge is unavailable: l4..l7 are replaced by repeating l3. */
227 static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
228 LOAD_TOP_EDGE
229 LOAD_TOP_RIGHT_EDGE
230 LOAD_LEFT_EDGE
231 
232 src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
233 src[1+0*stride]=
234 src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
235 src[2+0*stride]=
236 src[1+1*stride]=
237 src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
238 src[3+0*stride]=
239 src[2+1*stride]=
240 src[1+2*stride]=
241 src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
242 src[3+1*stride]=
243 src[2+2*stride]=
244 src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
245 src[3+2*stride]=
246 src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
247 src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
248 }
249
/* H.264 4x4 vertical-right prediction: even rows use 2-tap averages of
 * the top edge, odd rows use [1 2 1] filters; the left column feeds the
 * two bottom-left pixels. */
250 static void pred4x4_vertical_right_c(uint8_t *src, const uint8_t *topright, int stride){
251 const int lt= src[-1-1*stride];
252 LOAD_TOP_EDGE
253 LOAD_LEFT_EDGE
254 
255 src[0+0*stride]=
256 src[1+2*stride]=(lt + t0 + 1)>>1;
257 src[1+0*stride]=
258 src[2+2*stride]=(t0 + t1 + 1)>>1;
259 src[2+0*stride]=
260 src[3+2*stride]=(t1 + t2 + 1)>>1;
261 src[3+0*stride]=(t2 + t3 + 1)>>1;
262 src[0+1*stride]=
263 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
264 src[1+1*stride]=
265 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
266 src[2+1*stride]=
267 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
268 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
269 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
270 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
271 }
272
/* H.264 4x4 vertical-left prediction: even rows are 2-tap averages and
 * odd rows [1 2 1] filters over the top and top-right edges. */
273 static void pred4x4_vertical_left_c(uint8_t *src, const uint8_t *topright, int stride){
274 LOAD_TOP_EDGE
275 LOAD_TOP_RIGHT_EDGE
276 
277 src[0+0*stride]=(t0 + t1 + 1)>>1;
278 src[1+0*stride]=
279 src[0+2*stride]=(t1 + t2 + 1)>>1;
280 src[2+0*stride]=
281 src[1+2*stride]=(t2 + t3 + 1)>>1;
282 src[3+0*stride]=
283 src[2+2*stride]=(t3 + t4+ 1)>>1;
284 src[3+2*stride]=(t4 + t5+ 1)>>1;
285 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
286 src[1+1*stride]=
287 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
288 src[2+1*stride]=
289 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
290 src[3+1*stride]=
291 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
292 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
293 }
294
/* Shared core for the RV40 vertical-left predictors.  Like the H.264
 * version but the two left-column pixels (0,0) and (0,1) additionally
 * blend in left-edge samples l1..l4, which the caller supplies so the
 * "nodown" variant can substitute l3 for the below-left pixel l4. */
295 static void pred4x4_vertical_left_rv40(uint8_t *src, const uint8_t *topright, int stride,
296 const int l0, const int l1, const int l2, const int l3, const int l4){
297 LOAD_TOP_EDGE
298 LOAD_TOP_RIGHT_EDGE
299 
300 src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
301 src[1+0*stride]=
302 src[0+2*stride]=(t1 + t2 + 1)>>1;
303 src[2+0*stride]=
304 src[1+2*stride]=(t2 + t3 + 1)>>1;
305 src[3+0*stride]=
306 src[2+2*stride]=(t3 + t4+ 1)>>1;
307 src[3+2*stride]=(t4 + t5+ 1)>>1;
308 src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
309 src[1+1*stride]=
310 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
311 src[2+1*stride]=
312 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
313 src[3+1*stride]=
314 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
315 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
316 }
317
/* RV40 vertical-left prediction with the below-left edge available. */
318 static void pred4x4_vertical_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
319 LOAD_LEFT_EDGE
320 LOAD_DOWN_LEFT_EDGE
321 
322 pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l4);
323 }
324
/* RV40 vertical-left prediction without the below-left edge: l3 stands
 * in for the missing l4. */
325 static void pred4x4_vertical_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
326 LOAD_LEFT_EDGE
327 
328 pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l3);
329 }
330
/* VP8 4x4 vertical-left prediction: as the H.264 version, but the last
 * two pixels of row 2 and 3 extend further into the top-right edge
 * (using t6 and t7). */
331 static void pred4x4_vertical_left_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
332 LOAD_TOP_EDGE
333 LOAD_TOP_RIGHT_EDGE
334 
335 src[0+0*stride]=(t0 + t1 + 1)>>1;
336 src[1+0*stride]=
337 src[0+2*stride]=(t1 + t2 + 1)>>1;
338 src[2+0*stride]=
339 src[1+2*stride]=(t2 + t3 + 1)>>1;
340 src[3+0*stride]=
341 src[2+2*stride]=(t3 + t4 + 1)>>1;
342 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
343 src[1+1*stride]=
344 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
345 src[2+1*stride]=
346 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
347 src[3+1*stride]=
348 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
349 src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
350 src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
351 }
352
/* H.264 4x4 horizontal-up prediction: interpolates upward along the left
 * edge; once past the last left sample the remaining pixels are pinned
 * to l3. */
353 static void pred4x4_horizontal_up_c(uint8_t *src, const uint8_t *topright, int stride){
354 LOAD_LEFT_EDGE
355 
356 src[0+0*stride]=(l0 + l1 + 1)>>1;
357 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
358 src[2+0*stride]=
359 src[0+1*stride]=(l1 + l2 + 1)>>1;
360 src[3+0*stride]=
361 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
362 src[2+1*stride]=
363 src[0+2*stride]=(l2 + l3 + 1)>>1;
364 src[3+1*stride]=
365 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
366 src[3+2*stride]=
367 src[1+3*stride]=
368 src[0+3*stride]=
369 src[2+2*stride]=
370 src[2+3*stride]=
371 src[3+3*stride]=l3;
372 }
373
/* RV40 horizontal-up prediction: blends [1 2 1]-filtered top/top-right
 * samples with left-edge samples, continuing into the below-left edge
 * (l4..l6) for the bottom rows. */
374 static void pred4x4_horizontal_up_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
375 LOAD_LEFT_EDGE
376 LOAD_DOWN_LEFT_EDGE
377 LOAD_TOP_EDGE
378 LOAD_TOP_RIGHT_EDGE
379 
380 src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
381 src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
382 src[2+0*stride]=
383 src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
384 src[3+0*stride]=
385 src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
386 src[2+1*stride]=
387 src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
388 src[3+1*stride]=
389 src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
390 src[3+2*stride]=
391 src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
392 src[0+3*stride]=
393 src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
394 src[2+3*stride]=(l4 + l5 + 1)>>1;
395 src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
396 }
397
/* Same as pred4x4_horizontal_up_rv40_c but with no below-left edge:
 * l4..l6 collapse to l3, so the bottom-row pixels become l3. */
398 static void pred4x4_horizontal_up_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
399 LOAD_LEFT_EDGE
400 LOAD_TOP_EDGE
401 LOAD_TOP_RIGHT_EDGE
402 
403 src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
404 src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
405 src[2+0*stride]=
406 src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
407 src[3+0*stride]=
408 src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
409 src[2+1*stride]=
410 src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
411 src[3+1*stride]=
412 src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
413 src[3+2*stride]=
414 src[1+3*stride]=l3;
415 src[0+3*stride]=
416 src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
417 src[2+3*stride]=
418 src[3+3*stride]=l3;
419 }
420
/* H.264 4x4 horizontal-down prediction: even columns of each diagonal
 * are 2-tap averages of left-edge pairs, odd columns [1 2 1] filters;
 * the top row draws from the top edge. */
421 static void pred4x4_horizontal_down_c(uint8_t *src, const uint8_t *topright, int stride){
422 const int lt= src[-1-1*stride];
423 LOAD_TOP_EDGE
424 LOAD_LEFT_EDGE
425 
426 src[0+0*stride]=
427 src[2+1*stride]=(lt + l0 + 1)>>1;
428 src[1+0*stride]=
429 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
430 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
431 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
432 src[0+1*stride]=
433 src[2+2*stride]=(l0 + l1 + 1)>>1;
434 src[1+1*stride]=
435 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
436 src[0+2*stride]=
437 src[2+3*stride]=(l1 + l2+ 1)>>1;
438 src[1+2*stride]=
439 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
440 src[0+3*stride]=(l2 + l3 + 1)>>1;
441 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
442 }
443
/* VP8 4x4 TrueMotion prediction: pixel(x,y) = top(x) + left(y) -
 * topleft, implemented via an ff_cropTbl lookup biased by -topleft
 * (presumably clamping the sum to 0..255 -- ff_cropTbl is declared
 * elsewhere; confirm against dsputil). */
444 static void pred4x4_tm_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
445 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
446 uint8_t *top = src-stride;
447 int y;
448 
449 for (y = 0; y < 4; y++) {
450 uint8_t *cm_in = cm + src[-1];
451 src[0] = cm_in[top[0]];
452 src[1] = cm_in[top[1]];
453 src[2] = cm_in[top[2]];
454 src[3] = cm_in[top[3]];
455 src += stride;
456 }
457 }
458
/* 16x16 vertical prediction: replicate the 16 pixels above the block
 * into all 16 rows, four 32-bit words per row. */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    int i, j;
    uint32_t row[4];

    for(j=0; j<4; j++)
        row[j] = ((uint32_t*)(src-stride))[j];

    for(i=0; i<16; i++)
        for(j=0; j<4; j++)
            ((uint32_t*)(src+i*stride))[j] = row[j];
}
473
474 static void pred16x16_horizontal_c(uint8_t *src, int stride){
475 int i;
476
477 for(i=0; i<16; i++){
478 ((uint32_t*)(src+i*stride))[0]=
479 ((uint32_t*)(src+i*stride))[1]=
480 ((uint32_t*)(src+i*stride))[2]=
481 ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
482 }
483 }
484
/* 16x16 DC prediction: fill the block with the rounded mean of the 16
 * left and 16 top neighbours. */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, j;
    int sum = 16;                      /* rounding bias for >>5 */
    uint32_t fill;

    for(i=0; i<16; i++)
        sum += src[-1+i*stride] + src[i-stride];

    fill = (sum>>5) * 0x01010101U;

    for(i=0; i<16; i++)
        for(j=0; j<4; j++)
            ((uint32_t*)(src+i*stride))[j] = fill;
}
505
/* 16x16 left-DC prediction: fill with the rounded mean of the 16 left
 * neighbours only. */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, j;
    int sum = 8;
    uint32_t fill;

    for(i=0; i<16; i++)
        sum += src[-1+i*stride];

    fill = (sum>>4) * 0x01010101U;

    for(i=0; i<16; i++)
        for(j=0; j<4; j++)
            ((uint32_t*)(src+i*stride))[j] = fill;
}
522
/* 16x16 top-DC prediction: fill with the rounded mean of the 16 top
 * neighbours only. */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, j;
    int sum = 8;
    uint32_t fill;

    for(i=0; i<16; i++)
        sum += src[i-stride];

    fill = (sum>>4) * 0x01010101U;

    for(i=0; i<16; i++)
        for(j=0; j<4; j++)
            ((uint32_t*)(src+i*stride))[j] = fill;
}
538
/* 16x16 DC prediction with no neighbours available: fill with mid-grey
 * (128). */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    int i, j;

    for(i=0; i<16; i++)
        for(j=0; j<4; j++)
            ((uint32_t*)(src+i*stride))[j]= 0x01010101U*128U;
}
549
/* Shared 16x16 plane prediction.  Fits a linear gradient to the top and
 * left edges (H/V are weighted sums of edge differences) and fills the
 * block with the clamped plane a + x*H + y*V.  svq3 and rv40 select the
 * codec-specific H/V scaling; svq3 additionally swaps H and V, which the
 * original author notes is required for bit-exactness. */
550 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){
551 int i, j, k;
552 int a;
553 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
554 const uint8_t * const src0 = src+7-stride;
555 const uint8_t *src1 = src+8*stride-1;
556 const uint8_t *src2 = src1-2*stride; // == src+6*stride-1;
557 int H = src0[1] - src0[-1];
558 int V = src1[0] - src2[ 0];
559 for(k=2; k<=8; ++k) {
560 src1 += stride; src2 -= stride;
561 H += k*(src0[k] - src0[-k]);
562 V += k*(src1[0] - src2[ 0]);
563 }
564 if(svq3){
565 H = ( 5*(H/4) ) / 16;
566 V = ( 5*(V/4) ) / 16;
567 
568 /* required for 100% accuracy */
569 i = H; H = V; V = i;
570 }else if(rv40){
571 H = ( H + (H>>2) ) >> 4;
572 V = ( V + (V>>2) ) >> 4;
573 }else{
574 H = ( 5*H+32 ) >> 6;
575 V = ( 5*V+32 ) >> 6;
576 }
577 
578 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
579 for(j=16; j>0; --j) {
580 int b = a;
581 a += V;
582 for(i=-16; i<0; i+=4) {
583 src[16+i] = cm[ (b ) >> 5 ];
584 src[17+i] = cm[ (b+ H) >> 5 ];
585 src[18+i] = cm[ (b+2*H) >> 5 ];
586 src[19+i] = cm[ (b+3*H) >> 5 ];
587 b += 4*H;
588 }
589 src += stride;
590 }
591 }
592
/* H.264 16x16 plane prediction (standard scaling). */
593 static void pred16x16_plane_c(uint8_t *src, int stride){
594 pred16x16_plane_compat_c(src, stride, 0, 0);
595 }
596
/* SVQ3 16x16 plane prediction (SVQ3 scaling, H/V swapped). */
597 static void pred16x16_plane_svq3_c(uint8_t *src, int stride){
598 pred16x16_plane_compat_c(src, stride, 1, 0);
599 }
600
/* RV40 16x16 plane prediction (RV40 scaling). */
601 static void pred16x16_plane_rv40_c(uint8_t *src, int stride){
602 pred16x16_plane_compat_c(src, stride, 0, 1);
603 }
604
/* VP8 16x16 TrueMotion prediction: pixel(x,y) = top(x) + left(y) -
 * topleft, via the ff_cropTbl lookup biased by -topleft (clamping --
 * same scheme as pred4x4_tm_vp8_c), fully unrolled per row. */
605 static void pred16x16_tm_vp8_c(uint8_t *src, int stride){
606 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
607 uint8_t *top = src-stride;
608 int y;
609 
610 for (y = 0; y < 16; y++) {
611 uint8_t *cm_in = cm + src[-1];
612 src[0] = cm_in[top[0]];
613 src[1] = cm_in[top[1]];
614 src[2] = cm_in[top[2]];
615 src[3] = cm_in[top[3]];
616 src[4] = cm_in[top[4]];
617 src[5] = cm_in[top[5]];
618 src[6] = cm_in[top[6]];
619 src[7] = cm_in[top[7]];
620 src[8] = cm_in[top[8]];
621 src[9] = cm_in[top[9]];
622 src[10] = cm_in[top[10]];
623 src[11] = cm_in[top[11]];
624 src[12] = cm_in[top[12]];
625 src[13] = cm_in[top[13]];
626 src[14] = cm_in[top[14]];
627 src[15] = cm_in[top[15]];
628 src += stride;
629 }
630 }
631
/* 8x8 vertical prediction: replicate the 8 pixels above the block into
 * all 8 rows (two 32-bit words per row). */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t lo = ((uint32_t*)(src-stride))[0];
    const uint32_t hi = ((uint32_t*)(src-stride))[1];

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= lo;
        ((uint32_t*)(src+i*stride))[1]= hi;
    }
}
642
/* 8x8 horizontal prediction: each row is filled with its own left
 * neighbour. */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        const uint32_t v = src[-1+i*stride]*0x01010101;
        ((uint32_t*)(src+i*stride))[0]= v;
        ((uint32_t*)(src+i*stride))[1]= v;
    }
}
651
/* 8x8 DC prediction with no neighbours available: fill with mid-grey
 * (128). */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= 0x01010101U*128U;
        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
    }
}
660
/* 8x8 left-DC prediction: the top half is filled with the mean of left
 * neighbours 0-3, the bottom half with the mean of left neighbours 4-7. */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_top = 2, sum_bot = 2;
    uint32_t dc_top, dc_bot;

    for(i=0; i<4; i++){
        sum_top += src[-1+i*stride];
        sum_bot += src[-1+(i+4)*stride];
    }
    dc_top = (sum_top>>2) * 0x01010101U;
    dc_bot = (sum_bot>>2) * 0x01010101U;

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc_top;
        ((uint32_t*)(src+(i+4)*stride))[0]=
        ((uint32_t*)(src+(i+4)*stride))[1]= dc_bot;
    }
}
682
/* RV40 8x8 left-DC prediction: one DC from all eight left neighbours,
 * replicated over the whole block. */
static void pred8x8_left_dc_rv40_c(uint8_t *src, int stride){
    int i, sum = 4;
    uint32_t fill;

    for(i=0; i<8; i++)
        sum += src[-1+i*stride];
    fill = (sum>>3) * 0x01010101U;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
697
/* 8x8 top-DC prediction: the left 8x4 half uses the mean of top
 * neighbours 0-3, the right half the mean of top neighbours 4-7; both
 * halves span all eight rows. */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int suml = 2, sumr = 2;
    uint32_t dcl, dcr;

    for(i=0; i<4; i++){
        suml += src[i-stride];
        sumr += src[4+i-stride];
    }
    dcl = (suml>>2) * 0x01010101U;
    dcr = (sumr>>2) * 0x01010101U;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dcl;
        ((uint32_t*)(src+i*stride))[1]= dcr;
    }
}
719
/* RV40 8x8 top-DC prediction: one DC from all eight top neighbours,
 * replicated over the whole block. */
static void pred8x8_top_dc_rv40_c(uint8_t *src, int stride){
    int i, sum = 4;
    uint32_t fill;

    for(i=0; i<8; i++)
        sum += src[i-stride];
    fill = (sum>>3) * 0x01010101U;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
734
735
/* 8x8 DC prediction split into four 4x4 quadrants:
 *   top-left    = mean(left 0-3, top 0-3)
 *   top-right   = mean(top 4-7)
 *   bottom-left = mean(left 4-7)
 *   bottom-right= mean(top 4-7, left 4-7) */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_tl = 0, sum_tr = 0, sum_bl = 0;
    uint32_t dc_tl, dc_tr, dc_bl, dc_br;

    for(i=0; i<4; i++){
        sum_tl += src[-1+i*stride] + src[i-stride];
        sum_tr += src[4+i-stride];
        sum_bl += src[-1+(i+4)*stride];
    }
    dc_br = 0x01010101*((sum_tr + sum_bl + 4)>>3);
    dc_tl = 0x01010101*((sum_tl + 4)>>3);
    dc_tr = 0x01010101*((sum_tr + 2)>>2);
    dc_bl = 0x01010101*((sum_bl + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc_tl;
        ((uint32_t*)(src+i*stride))[1]= dc_tr;
        ((uint32_t*)(src+(i+4)*stride))[0]= dc_bl;
        ((uint32_t*)(src+(i+4)*stride))[1]= dc_br;
    }
}
760
761 // The following 4 functions must not be replaced by optimized versions!
/* Mixed-availability 8x8 DC (left half of top row + top edge): top-DC
 * over the whole block, then the top-left 4x4 redone with full DC.
 * Must stay as these exact calls -- see the note above this group. */
762 static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){
763 pred8x8_top_dc_c(src, stride);
764 pred4x4_dc_c(src, NULL, stride);
765 }
766
/* Mixed-availability 8x8 DC: full DC over the block, then the top-left
 * 4x4 redone with top-only DC. */
767 static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){
768 pred8x8_dc_c(src, stride);
769 pred4x4_top_dc_c(src, NULL, stride);
770 }
771
/* Mixed-availability 8x8 DC: left-DC over the block, then both bottom
 * 4x4 quadrants overwritten with 128. */
772 static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){
773 pred8x8_left_dc_c(src, stride);
774 pred4x4_128_dc_c(src + 4*stride , NULL, stride);
775 pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride);
776 }
777
/* Mixed-availability 8x8 DC: left-DC over the block, then both top
 * 4x4 quadrants overwritten with 128. */
778 static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){
779 pred8x8_left_dc_c(src, stride);
780 pred4x4_128_dc_c(src , NULL, stride);
781 pred4x4_128_dc_c(src + 4, NULL, stride);
782 }
783
/* RV40 8x8 DC prediction: a single DC from all eight top and all eight
 * left neighbours, replicated over the whole block. */
static void pred8x8_dc_rv40_c(uint8_t *src, int stride){
    int i;
    int sum = 8;                       /* rounding bias for >>4 */
    uint32_t fill;

    for(i=0; i<4; i++){
        sum += src[-1+i*stride] + src[i-stride];
        sum += src[4+i-stride];
        sum += src[-1+(i+4)*stride];
    }
    fill = 0x01010101*(sum>>4);

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= fill;
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
804
/* 8x8 plane prediction: like the 16x16 version but with 4-sample edge
 * fits and H.264 chroma scaling (17*x+16)>>5; fills the block with the
 * clamped plane a + x*H + y*V via ff_cropTbl. */
805 static void pred8x8_plane_c(uint8_t *src, int stride){
806 int j, k;
807 int a;
808 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
809 const uint8_t * const src0 = src+3-stride;
810 const uint8_t *src1 = src+4*stride-1;
811 const uint8_t *src2 = src1-2*stride; // == src+2*stride-1;
812 int H = src0[1] - src0[-1];
813 int V = src1[0] - src2[ 0];
814 for(k=2; k<=4; ++k) {
815 src1 += stride; src2 -= stride;
816 H += k*(src0[k] - src0[-k]);
817 V += k*(src1[0] - src2[ 0]);
818 }
819 H = ( 17*H+16 ) >> 5;
820 V = ( 17*V+16 ) >> 5;
821 
822 a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
823 for(j=8; j>0; --j) {
824 int b = a;
825 a += V;
826 src[0] = cm[ (b ) >> 5 ];
827 src[1] = cm[ (b+ H) >> 5 ];
828 src[2] = cm[ (b+2*H) >> 5 ];
829 src[3] = cm[ (b+3*H) >> 5 ];
830 src[4] = cm[ (b+4*H) >> 5 ];
831 src[5] = cm[ (b+5*H) >> 5 ];
832 src[6] = cm[ (b+6*H) >> 5 ];
833 src[7] = cm[ (b+7*H) >> 5 ];
834 src += stride;
835 }
836 }
837
/* VP8 8x8 TrueMotion prediction: pixel(x,y) = top(x) + left(y) -
 * topleft via the biased ff_cropTbl lookup (same scheme as
 * pred4x4_tm_vp8_c), unrolled per row. */
838 static void pred8x8_tm_vp8_c(uint8_t *src, int stride){
839 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
840 uint8_t *top = src-stride;
841 int y;
842 
843 for (y = 0; y < 8; y++) {
844 uint8_t *cm_in = cm + src[-1];
845 src[0] = cm_in[top[0]];
846 src[1] = cm_in[top[1]];
847 src[2] = cm_in[top[2]];
848 src[3] = cm_in[top[3]];
849 src[4] = cm_in[top[4]];
850 src[5] = cm_in[top[5]];
851 src[6] = cm_in[top[6]];
852 src[7] = cm_in[top[7]];
853 src += stride;
854 }
855 }
856
/* Helper macros for the 8x8 "luma 8x8" (pred8x8l) predictors.  All edges
 * are pre-filtered with [1 2 1]; has_topleft/has_topright select the
 * fallback sample when a corner/edge is unavailable. */
857 #define SRC(x,y) src[(x)+(y)*stride]
858 #define PL(y) \
859 const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
860 #define PREDICT_8x8_LOAD_LEFT \
861 const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
862 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
863 PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
864 const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
865 
/* t0..t7: filtered top edge, mirroring PREDICT_8x8_LOAD_LEFT. */
866 #define PT(x) \
867 const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
868 #define PREDICT_8x8_LOAD_TOP \
869 const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
870 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
871 PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
872 const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
873 + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
874 
/* t8..t15: filtered top-right edge, or SRC(7,-1) replicated when the
 * top-right neighbours are unavailable. */
875 #define PTR(x) \
876 t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
877 #define PREDICT_8x8_LOAD_TOPRIGHT \
878 int t8, t9, t10, t11, t12, t13, t14, t15; \
879 if(has_topright) { \
880 PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
881 t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
882 } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
883 
884 #define PREDICT_8x8_LOAD_TOPLEFT \
885 const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
886 
/* Fill the 8x8 block with a replicated 32-bit DC value v. */
887 #define PREDICT_8x8_DC(v) \
888 int y; \
889 for( y = 0; y < 8; y++ ) { \
890 ((uint32_t*)src)[0] = \
891 ((uint32_t*)src)[1] = v; \
892 src += stride; \
893 }
894
/* 8x8l DC prediction with no neighbours: fill with mid-grey (0x80). */
895 static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
896 {
897 PREDICT_8x8_DC(0x80808080);
898 }
/* 8x8l left-DC prediction: mean of the eight filtered left samples. */
899 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
900 {
901 PREDICT_8x8_LOAD_LEFT;
902 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
903 PREDICT_8x8_DC(dc);
904 }
/* 8x8l top-DC prediction: mean of the eight filtered top samples. */
905 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
906 {
907 PREDICT_8x8_LOAD_TOP;
908 const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
909 PREDICT_8x8_DC(dc);
910 }
/* 8x8l DC prediction: mean of all sixteen filtered left+top samples. */
911 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
912 {
913 PREDICT_8x8_LOAD_LEFT;
914 PREDICT_8x8_LOAD_TOP;
915 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
916 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
917 PREDICT_8x8_DC(dc);
918 }
/* 8x8l horizontal prediction: each row filled with its filtered left
 * sample (ROW writes one row as two 32-bit words). */
919 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
920 {
921 PREDICT_8x8_LOAD_LEFT;
922 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
923 ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
924 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
925 #undef ROW
926 }
/* 8x8l vertical prediction: write the filtered top row once, then copy
 * it into the remaining seven rows with 64-bit loads/stores. */
927 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
928 {
929 int y;
930 PREDICT_8x8_LOAD_TOP;
931 src[0] = t0;
932 src[1] = t1;
933 src[2] = t2;
934 src[3] = t3;
935 src[4] = t4;
936 src[5] = t5;
937 src[6] = t6;
938 src[7] = t7;
939 for( y = 1; y < 8; y++ )
940 *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
941 }
/* 8x8l diagonal down-left prediction: each down-left diagonal is a
 * [1 2 1] filter over the (pre-filtered) top and top-right samples. */
942 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
943 {
944 PREDICT_8x8_LOAD_TOP;
945 PREDICT_8x8_LOAD_TOPRIGHT;
946 SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
947 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
948 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
949 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
950 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
951 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
952 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
953 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
954 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
955 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
956 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
957 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
958 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
959 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
960 SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
961 }
/* 8x8l diagonal down-right prediction: each down-right diagonal is a
 * [1 2 1] filter over the filtered left column, top-left corner and top
 * row. */
962 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
963 {
964 PREDICT_8x8_LOAD_TOP;
965 PREDICT_8x8_LOAD_LEFT;
966 PREDICT_8x8_LOAD_TOPLEFT;
967 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
968 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
969 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
970 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
971 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
972 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
973 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
974 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
975 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
976 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
977 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
978 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
979 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
980 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
981 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
982 
983 }
/* 8x8l vertical-right prediction: alternating 2-tap averages and
 * [1 2 1] filters along near-vertical diagonals; the left column feeds
 * the lower-left pixels. */
984 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
985 {
986 PREDICT_8x8_LOAD_TOP;
987 PREDICT_8x8_LOAD_LEFT;
988 PREDICT_8x8_LOAD_TOPLEFT;
989 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
990 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
991 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
992 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
993 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
994 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
995 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
996 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
997 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
998 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
999 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1000 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1001 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1002 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1003 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1004 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1005 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1006 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1007 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1008 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1009 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1010 SRC(7,0)= (t6 + t7 + 1) >> 1;
1011 }
/**
 * 8x8 luma intra prediction, horizontal-down mode.
 *
 * PREDICT_8x8_LOAD_TOP/LEFT/TOPLEFT (macros defined earlier in this file)
 * bring the neighbouring edge samples into scope as locals t0..t7, l0..l7
 * and lt.  SRC(x,y) addresses the destination pixel at column x, row y;
 * each statement writes all destination pixels sharing one predicted value,
 * mixing 2-tap (a+b+1)>>1 and 3-tap (a+2*b+c+2)>>2 rounding filters.
 */
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction, vertical-left mode.
 *
 * Only the samples above the block are used: PREDICT_8x8_LOAD_TOP provides
 * t0..t7 and PREDICT_8x8_LOAD_TOPRIGHT extends the row with t8 and beyond
 * (t8..t12 are consumed here).  SRC(x,y) addresses the destination pixel at
 * column x, row y; each statement writes all destination pixels sharing one
 * predicted value, alternating the 2-tap (a+b+1)>>1 and 3-tap
 * (a+2*b+c+2)>>2 rounding filters.
 */
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction, horizontal-up mode.
 *
 * Only the left-column samples l0..l7 (provided by PREDICT_8x8_LOAD_LEFT,
 * defined earlier in this file) are used.  SRC(x,y) addresses the
 * destination pixel at column x, row y.  Pixels whose interpolation would
 * reach past the last available left sample are simply filled with l7
 * (the final multi-assignment below).
 */
static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
/* Tear down the helper macros used only by the 8x8 prediction functions
 * above so their names do not leak into the rest of the file. */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
1098
1099 static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
1100 int i;
1101 pix -= stride;
1102 for(i=0; i<4; i++){
1103 uint8_t v = pix[0];
1104 pix[1*stride]= v += block[0];
1105 pix[2*stride]= v += block[4];
1106 pix[3*stride]= v += block[8];
1107 pix[4*stride]= v + block[12];
1108 pix++;
1109 block++;
1110 }
1111 }
1112
1113 static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
1114 int i;
1115 for(i=0; i<4; i++){
1116 uint8_t v = pix[-1];
1117 pix[0]= v += block[0];
1118 pix[1]= v += block[1];
1119 pix[2]= v += block[2];
1120 pix[3]= v + block[3];
1121 pix+= stride;
1122 block+= 4;
1123 }
1124 }
1125
1126 static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
1127 int i;
1128 pix -= stride;
1129 for(i=0; i<8; i++){
1130 uint8_t v = pix[0];
1131 pix[1*stride]= v += block[0];
1132 pix[2*stride]= v += block[8];
1133 pix[3*stride]= v += block[16];
1134 pix[4*stride]= v += block[24];
1135 pix[5*stride]= v += block[32];
1136 pix[6*stride]= v += block[40];
1137 pix[7*stride]= v += block[48];
1138 pix[8*stride]= v + block[56];
1139 pix++;
1140 block++;
1141 }
1142 }
1143
1144 static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
1145 int i;
1146 for(i=0; i<8; i++){
1147 uint8_t v = pix[-1];
1148 pix[0]= v += block[0];
1149 pix[1]= v += block[1];
1150 pix[2]= v += block[2];
1151 pix[3]= v += block[3];
1152 pix[4]= v += block[4];
1153 pix[5]= v += block[5];
1154 pix[6]= v += block[6];
1155 pix[7]= v + block[7];
1156 pix+= stride;
1157 block+= 8;
1158 }
1159 }
1160
1161 static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1162 int i;
1163 for(i=0; i<16; i++)
1164 pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
1165 }
1166
1167 static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1168 int i;
1169 for(i=0; i<16; i++)
1170 pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
1171 }
1172
1173 static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1174 int i;
1175 for(i=0; i<4; i++)
1176 pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
1177 }
1178
1179 static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1180 int i;
1181 for(i=0; i<4; i++)
1182 pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
1183 }
1184
1185
1186 /**
1187 * Sets the intra prediction function pointers.
1188 */
1189 void ff_h264_pred_init(H264PredContext *h, int codec_id){
1190 // MpegEncContext * const s = &h->s;
1191
1192 if(codec_id != CODEC_ID_RV40){
1193 if(codec_id == CODEC_ID_VP8) {
1194 h->pred4x4[VERT_PRED ]= pred4x4_vertical_vp8_c;
1195 h->pred4x4[HOR_PRED ]= pred4x4_horizontal_vp8_c;
1196 } else {
1197 h->pred4x4[VERT_PRED ]= pred4x4_vertical_c;
1198 h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c;
1199 }
1200 h->pred4x4[DC_PRED ]= pred4x4_dc_c;
1201 if(codec_id == CODEC_ID_SVQ3)
1202 h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_svq3_c;
1203 else
1204 h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
1205 h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
1206 h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c;
1207 h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c;
1208 if (codec_id == CODEC_ID_VP8) {
1209 h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_vp8_c;
1210 } else
1211 h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c;
1212 h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c;
1213 h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c;
1214 h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c;
1215 h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c;
1216 if(codec_id == CODEC_ID_VP8)
1217 h->pred4x4[TM_VP8_PRED ]= pred4x4_tm_vp8_c;
1218 }else{
1219 h->pred4x4[VERT_PRED ]= pred4x4_vertical_c;
1220 h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c;
1221 h->pred4x4[DC_PRED ]= pred4x4_dc_c;
1222 h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_rv40_c;
1223 h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
1224 h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c;
1225 h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c;
1226 h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_rv40_c;
1227 h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_rv40_c;
1228 h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c;
1229 h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c;
1230 h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c;
1231 h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN]= pred4x4_down_left_rv40_nodown_c;
1232 h->pred4x4[HOR_UP_PRED_RV40_NODOWN]= pred4x4_horizontal_up_rv40_nodown_c;
1233 h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN]= pred4x4_vertical_left_rv40_nodown_c;
1234 }
1235
1236 h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c;
1237 h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c;
1238 h->pred8x8l[DC_PRED ]= pred8x8l_dc_c;
1239 h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
1240 h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
1241 h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c;
1242 h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c;
1243 h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c;
1244 h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c;
1245 h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c;
1246 h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c;
1247 h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c;
1248
1249 h->pred8x8[VERT_PRED8x8 ]= pred8x8_vertical_c;
1250 h->pred8x8[HOR_PRED8x8 ]= pred8x8_horizontal_c;
1251 if (codec_id != CODEC_ID_VP8) {
1252 h->pred8x8[PLANE_PRED8x8]= pred8x8_plane_c;
1253 } else
1254 h->pred8x8[PLANE_PRED8x8]= pred8x8_tm_vp8_c;
1255 if(codec_id != CODEC_ID_RV40 && codec_id != CODEC_ID_VP8){
1256 h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_c;
1257 h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
1258 h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
1259 h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t;
1260 h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt;
1261 h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00;
1262 h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0;
1263 }else{
1264 h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_rv40_c;
1265 h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_rv40_c;
1266 h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_rv40_c;
1267 }
1268 h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
1269
1270 h->pred16x16[DC_PRED8x8 ]= pred16x16_dc_c;
1271 h->pred16x16[VERT_PRED8x8 ]= pred16x16_vertical_c;
1272 h->pred16x16[HOR_PRED8x8 ]= pred16x16_horizontal_c;
1273 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c;
1274 switch(codec_id){
1275 case CODEC_ID_SVQ3:
1276 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_svq3_c;
1277 break;
1278 case CODEC_ID_RV40:
1279 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_rv40_c;
1280 break;
1281 case CODEC_ID_VP8:
1282 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_tm_vp8_c;
1283 break;
1284 default:
1285 h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c;
1286 }
1287 h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
1288 h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
1289 h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
1290
1291 //special lossless h/v prediction for h264
1292 h->pred4x4_add [VERT_PRED ]= pred4x4_vertical_add_c;
1293 h->pred4x4_add [ HOR_PRED ]= pred4x4_horizontal_add_c;
1294 h->pred8x8l_add [VERT_PRED ]= pred8x8l_vertical_add_c;
1295 h->pred8x8l_add [ HOR_PRED ]= pred8x8l_horizontal_add_c;
1296 h->pred8x8_add [VERT_PRED8x8]= pred8x8_vertical_add_c;
1297 h->pred8x8_add [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
1298 h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
1299 h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
1300
1301 if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id);
1302 if (HAVE_MMX) ff_h264_pred_init_x86(h, codec_id);
1303 }