hadamard8_diff* enabled on linux/ppc
[libav.git] / libavcodec / ppc / dsputil_ppc.c
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "../dsputil.h"
22
23 #include "dsputil_ppc.h"
24
25 #ifdef HAVE_ALTIVEC
26 #include "dsputil_altivec.h"
27 #endif
28
29 extern void fdct_altivec(int16_t *block);
30 extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
31 extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
32
33 extern void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width);
34 extern void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1,
35 DWTELEM *b2, DWTELEM *b3,
36 DWTELEM *b4, DWTELEM *b5,
37 int width);
38 extern void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
39 uint8_t * * block, int b_w, int b_h,
40 int src_x, int src_y, int src_stride,
41 slice_buffer * sb, int add,
42 uint8_t * dst8);
43
/* CPU capability flags for this process; dsputil_init_ppc() ORs MM_ALTIVEC
   into it when AltiVec is detected. */
int mm_flags = 0;

/**
 * Report which SIMD extensions can be used at run time.
 *
 * @return MM_ALTIVEC when the build has HAVE_ALTIVEC and has_altivec()
 *         reports the unit as present, 0 otherwise.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
56
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Accumulated performance-counter statistics, indexed as
   [pmc][instrumented function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* Printable names of the instrumented functions.
   The list below must match the enum in dsputil_ppc.h.
   The strings are read-only literals that are only ever printed with "%s",
   hence plain (const) char rather than unsigned char. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#endif /* POWERPC_PERFORMANCE_REPORT */
91
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the collected performance-counter statistics.
 *
 * For every (function, PMC) pair whose sample count is non-zero, prints the
 * minimum, maximum and average (sum / count) together with the sample count.
 * Pairs with no samples are skipped, which also avoids dividing by zero.
 */
void powerpc_display_perf_report(void)
{
    int func, pmc;

    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");

    for (func = 0; func < powerpc_perf_total; func++) {
        for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
            const unsigned long long *stats   = perfdata[pmc][func];
            const unsigned long long  samples = stats[powerpc_data_num];

            if (samples == (unsigned long long)0)
                continue;

            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                   perfname[func],
                   pmc + 1, /* PMCs are reported 1-based */
                   stats[powerpc_data_min],
                   stats[powerpc_data_max],
                   (double)stats[powerpc_data_sum] / (double)samples,
                   samples);
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
115
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately, all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it now always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a whole cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
141 void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
142 {
143 POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
144 register int misal = ((unsigned long)blocks & 0x00000010);
145 register int i = 0;
146 POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
147 #if 1
148 if (misal) {
149 ((unsigned long*)blocks)[0] = 0L;
150 ((unsigned long*)blocks)[1] = 0L;
151 ((unsigned long*)blocks)[2] = 0L;
152 ((unsigned long*)blocks)[3] = 0L;
153 i += 16;
154 }
155 for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
156 #ifndef __MWERKS__
157 asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
158 #else
159 __dcbz( blocks, i );
160 #endif
161 }
162 if (misal) {
163 ((unsigned long*)blocks)[188] = 0L;
164 ((unsigned long*)blocks)[189] = 0L;
165 ((unsigned long*)blocks)[190] = 0L;
166 ((unsigned long*)blocks)[191] = 0L;
167 i += 16;
168 }
169 #else
170 memset(blocks, 0, sizeof(DCTELEM)*6*64);
171 #endif
172 POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
173 }
174
175 /* same as above, when dcbzl clear a whole 128B cache line
176 i.e. the PPC970 aka G5 */
177 #ifndef NO_DCBZL
178 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
179 {
180 POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
181 register int misal = ((unsigned long)blocks & 0x0000007f);
182 register int i = 0;
183 POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
184 #if 1
185 if (misal) {
186 // we could probably also optimize this case,
187 // but there's not much point as the machines
188 // aren't available yet (2003-06-26)
189 memset(blocks, 0, sizeof(DCTELEM)*6*64);
190 }
191 else
192 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
193 asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
194 }
195 #else
196 memset(blocks, 0, sizeof(DCTELEM)*6*64);
197 #endif
198 POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
199 }
200 #else
201 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
202 {
203 memset(blocks, 0, sizeof(DCTELEM)*6*64);
204 }
205 #endif
206
207 #ifndef NO_DCBZL
208 /* check dcbz report how many bytes are set to 0 by dcbz */
209 /* update 24/06/2003 : replace dcbz by dcbzl to get
210 the intended effect (Apple "fixed" dcbz)
211 unfortunately this cannot be used unless the assembler
212 knows about dcbzl ... */
213 long check_dcbzl_effect(void)
214 {
215 register char *fakedata = (char*)av_malloc(1024);
216 register char *fakedata_middle;
217 register long zero = 0;
218 register long i = 0;
219 long count = 0;
220
221 if (!fakedata)
222 {
223 return 0L;
224 }
225
226 fakedata_middle = (fakedata + 512);
227
228 memset(fakedata, 0xFF, 1024);
229
230 /* below the constraint "b" seems to mean "Address base register"
231 in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
232 asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
233
234 for (i = 0; i < 1024 ; i ++)
235 {
236 if (fakedata[i] == (char)0)
237 count++;
238 }
239
240 av_free(fakedata);
241
242 return count;
243 }
244 #else
245 long check_dcbzl_effect(void)
246 {
247 return 0;
248 }
249 #endif
250
251
252 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
253
254 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
255 {
256 // Common optimizations whether Altivec is available or not
257
258 switch (check_dcbzl_effect()) {
259 case 32:
260 c->clear_blocks = clear_blocks_dcbz32_ppc;
261 break;
262 case 128:
263 c->clear_blocks = clear_blocks_dcbz128_ppc;
264 break;
265 default:
266 break;
267 }
268
269 #ifdef HAVE_ALTIVEC
270 dsputil_h264_init_ppc(c, avctx);
271
272 if (has_altivec()) {
273 mm_flags |= MM_ALTIVEC;
274
275 // Altivec specific optimisations
276 c->pix_abs[0][1] = sad16_x2_altivec;
277 c->pix_abs[0][2] = sad16_y2_altivec;
278 c->pix_abs[0][3] = sad16_xy2_altivec;
279 c->pix_abs[0][0] = sad16_altivec;
280 c->pix_abs[1][0] = sad8_altivec;
281 c->sad[0]= sad16_altivec;
282 c->sad[1]= sad8_altivec;
283 c->pix_norm1 = pix_norm1_altivec;
284 c->sse[1]= sse8_altivec;
285 c->sse[0]= sse16_altivec;
286 c->pix_sum = pix_sum_altivec;
287 c->diff_pixels = diff_pixels_altivec;
288 c->get_pixels = get_pixels_altivec;
289 // next one disabled as it's untested.
290 #if 0
291 c->add_bytes= add_bytes_altivec;
292 #endif /* 0 */
293 c->put_pixels_tab[0][0] = put_pixels16_altivec;
294 /* the two functions do the same thing, so use the same code */
295 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
296 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
297 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
298 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
299 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
300 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
301 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
302 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
303
304 c->gmc1 = gmc1_altivec;
305
306 c->hadamard8_diff[0] = hadamard8_diff16_altivec;
307 c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
308
309
310 c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
311 c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
312 c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
313
314 #ifdef CONFIG_ENCODERS
315 if (avctx->dct_algo == FF_DCT_AUTO ||
316 avctx->dct_algo == FF_DCT_ALTIVEC)
317 {
318 c->fdct = fdct_altivec;
319 }
320 #endif //CONFIG_ENCODERS
321
322 if (avctx->lowres==0)
323 {
324 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
325 (avctx->idct_algo == FF_IDCT_ALTIVEC))
326 {
327 c->idct_put = idct_put_altivec;
328 c->idct_add = idct_add_altivec;
329 #ifndef ALTIVEC_USE_REFERENCE_C_CODE
330 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
331 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
332 c->idct_permutation_type = FF_NO_IDCT_PERM;
333 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
334 }
335 }
336
337 #ifdef POWERPC_PERFORMANCE_REPORT
338 {
339 int i, j;
340 for (i = 0 ; i < powerpc_perf_total ; i++)
341 {
342 for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
343 {
344 perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
345 perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
346 perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
347 perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
348 }
349 }
350 }
351 #endif /* POWERPC_PERFORMANCE_REPORT */
352 } else
353 #endif /* HAVE_ALTIVEC */
354 {
355 // Non-AltiVec PPC optimisations
356
357 // ... pending ...
358 }
359 }