PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
[libav.git] / libavcodec / ppc / dsputil_ppc.c
1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #include "../dsputil.h"
21
22 #include "dsputil_ppc.h"
23
24 #ifdef HAVE_ALTIVEC
25 #include "dsputil_altivec.h"
26 #endif
27
/* AltiVec IDCT entry points; only declared here, their definitions are
   not part of this file section. */
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* CPU feature flags. dsputil_init_ppc() ORs MM_ALTIVEC into this when
   has_altivec() reports support. */
int mm_flags = 0;
32
/*
 * Report the multimedia extensions usable on this CPU.
 *
 * Returns MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 * has_altivec() succeeds at run time, 0 otherwise.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
43
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
#include <stdio.h>

/* Collected counter statistics, indexed [instrumented function][statistic]
   (powerpc_data_min / _max / _sum / _num). */
unsigned long long perfdata[powerpc_perf_total][powerpc_data_total];

/* Printable names of the instrumented functions, indexed like perfdata.
   The list below must match the enum in dsputil_ppc.h.
   Plain (const) char is used because the entries are string literals that
   are handed to a "%s" conversion; unsigned char* does not match either. */
static const char *const perfname[] = {
    "fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc"
};

#ifdef POWERPC_PERF_USE_PMC
/* Same layout as perfdata, for the second and third PMC registers. */
unsigned long long perfdata_pmc2[powerpc_perf_total][powerpc_data_total];
unsigned long long perfdata_pmc3[powerpc_perf_total][powerpc_data_total];
#endif /* POWERPC_PERF_USE_PMC */
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */
69
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/*
 * Dump the collected performance statistics to stderr.
 *
 * One entry is printed per instrumented function whose sample count
 * (powerpc_data_num) is non-zero: minimum, maximum, average and the
 * number of samples. With POWERPC_PERF_USE_PMC the same is repeated
 * for the pmc2 and pmc3 tables.
 */
void powerpc_display_perf_report(void)
{
    int fn;

#ifdef POWERPC_PERF_USE_PMC
    fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
#else /* !POWERPC_PERF_USE_PMC */
    fprintf(stderr, "PowerPC performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
#endif /* POWERPC_PERF_USE_PMC */

    for (fn = 0; fn < powerpc_perf_total; fn++) {
        /* statistics row currently being printed */
        const unsigned long long *stat = perfdata[fn];

        /* skip functions that were never sampled (also avoids 0/0) */
        if (stat[powerpc_data_num] != (unsigned long long)0) {
            fprintf(stderr, " Function \"%s\" (pmc1):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[fn],
                    stat[powerpc_data_min],
                    stat[powerpc_data_max],
                    (double)stat[powerpc_data_sum] /
                    (double)stat[powerpc_data_num],
                    stat[powerpc_data_num]);
        }
#ifdef POWERPC_PERF_USE_PMC
        stat = perfdata_pmc2[fn];
        if (stat[powerpc_data_num] != (unsigned long long)0) {
            fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[fn],
                    stat[powerpc_data_min],
                    stat[powerpc_data_max],
                    (double)stat[powerpc_data_sum] /
                    (double)stat[powerpc_data_num],
                    stat[powerpc_data_num]);
        }
        stat = perfdata_pmc3[fn];
        if (stat[powerpc_data_num] != (unsigned long long)0) {
            fprintf(stderr, " Function \"%s\" (pmc3):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[fn],
                    stat[powerpc_data_min],
                    stat[powerpc_data_max],
                    (double)stat[powerpc_data_sum] /
                    (double)stat[powerpc_data_num],
                    stat[powerpc_data_num]);
        }
#endif /* POWERPC_PERF_USE_PMC */
    }
}
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */
110
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
136 void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
137 {
138 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1);
139 register int misal = ((unsigned long)blocks & 0x00000010);
140 register int i = 0;
141 POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
142 #if 1
143 if (misal) {
144 ((unsigned long*)blocks)[0] = 0L;
145 ((unsigned long*)blocks)[1] = 0L;
146 ((unsigned long*)blocks)[2] = 0L;
147 ((unsigned long*)blocks)[3] = 0L;
148 i += 16;
149 }
150 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
151 asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
152 }
153 if (misal) {
154 ((unsigned long*)blocks)[188] = 0L;
155 ((unsigned long*)blocks)[189] = 0L;
156 ((unsigned long*)blocks)[190] = 0L;
157 ((unsigned long*)blocks)[191] = 0L;
158 i += 16;
159 }
160 #else
161 memset(blocks, 0, sizeof(DCTELEM)*6*64);
162 #endif
163 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
164 }
165
166 /* same as above, when dcbzl clear a whole 128B cache line
167 i.e. the PPC970 aka G5 */
168 #ifndef NO_DCBZL
169 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
170 {
171 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz128, 1);
172 register int misal = ((unsigned long)blocks & 0x0000007f);
173 register int i = 0;
174 POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
175 #if 1
176 if (misal) {
177 // we could probably also optimize this case,
178 // but there's not much point as the machines
179 // aren't available yet (2003-06-26)
180 memset(blocks, 0, sizeof(DCTELEM)*6*64);
181 }
182 else
183 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
184 asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
185 }
186 #else
187 memset(blocks, 0, sizeof(DCTELEM)*6*64);
188 #endif
189 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
190 }
191 #else
192 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
193 {
194 memset(blocks, 0, sizeof(DCTELEM)*6*64);
195 }
196 #endif
197
198 #ifndef NO_DCBZL
199 /* check dcbz report how many bytes are set to 0 by dcbz */
200 /* update 24/06/2003 : replace dcbz by dcbzl to get
201 the intended effect (Apple "fixed" dcbz)
202 unfortunately this cannot be used unless the assembler
203 knows about dcbzl ... */
204 long check_dcbzl_effect(void)
205 {
206 register char *fakedata = (char*)av_malloc(1024);
207 register char *fakedata_middle;
208 register long zero = 0;
209 register long i = 0;
210 long count = 0;
211
212 if (!fakedata)
213 {
214 return 0L;
215 }
216
217 fakedata_middle = (fakedata + 512);
218
219 memset(fakedata, 0xFF, 1024);
220
221 /* below the constraint "b" seems to mean "Address base register"
222 in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
223 asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
224
225 for (i = 0; i < 1024 ; i ++)
226 {
227 if (fakedata[i] == (char)0)
228 count++;
229 }
230
231 av_free(fakedata);
232
233 return count;
234 }
235 #else
236 long check_dcbzl_effect(void)
237 {
238 return 0;
239 }
240 #endif
241
242 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
243 {
244 // Common optimizations whether Altivec is available or not
245
246 switch (check_dcbzl_effect()) {
247 case 32:
248 c->clear_blocks = clear_blocks_dcbz32_ppc;
249 break;
250 case 128:
251 c->clear_blocks = clear_blocks_dcbz128_ppc;
252 break;
253 default:
254 break;
255 }
256
257 #if HAVE_ALTIVEC
258 if (has_altivec()) {
259 mm_flags |= MM_ALTIVEC;
260
261 // Altivec specific optimisations
262 c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
263 c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
264 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
265 c->pix_abs16x16 = pix_abs16x16_altivec;
266 c->pix_abs8x8 = pix_abs8x8_altivec;
267 c->sad[0]= sad16x16_altivec;
268 c->sad[1]= sad8x8_altivec;
269 c->pix_norm1 = pix_norm1_altivec;
270 c->sse[1]= sse8_altivec;
271 c->sse[0]= sse16_altivec;
272 c->pix_sum = pix_sum_altivec;
273 c->diff_pixels = diff_pixels_altivec;
274 c->get_pixels = get_pixels_altivec;
275 // next one disabled as it's untested.
276 #if 0
277 c->add_bytes= add_bytes_altivec;
278 #endif /* 0 */
279 c->put_pixels_tab[0][0] = put_pixels16_altivec;
280 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
281 // next one disabled as it's untested.
282 #if 0
283 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
284 #endif /* 0 */
285 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
286 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
287 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
288 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
289
290 c->gmc1 = gmc1_altivec;
291
292 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
293 (avctx->idct_algo == FF_IDCT_ALTIVEC))
294 {
295 c->idct_put = idct_put_altivec;
296 c->idct_add = idct_add_altivec;
297 #ifndef ALTIVEC_USE_REFERENCE_C_CODE
298 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
299 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
300 c->idct_permutation_type = FF_NO_IDCT_PERM;
301 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
302 }
303
304 #ifdef POWERPC_TBL_PERFORMANCE_REPORT
305 {
306 int i;
307 for (i = 0 ; i < powerpc_perf_total ; i++)
308 {
309 perfdata[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF;
310 perfdata[i][powerpc_data_max] = 0x0000000000000000;
311 perfdata[i][powerpc_data_sum] = 0x0000000000000000;
312 perfdata[i][powerpc_data_num] = 0x0000000000000000;
313 #ifdef POWERPC_PERF_USE_PMC
314 perfdata_pmc2[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF;
315 perfdata_pmc2[i][powerpc_data_max] = 0x0000000000000000;
316 perfdata_pmc2[i][powerpc_data_sum] = 0x0000000000000000;
317 perfdata_pmc2[i][powerpc_data_num] = 0x0000000000000000;
318 perfdata_pmc3[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF;
319 perfdata_pmc3[i][powerpc_data_max] = 0x0000000000000000;
320 perfdata_pmc3[i][powerpc_data_sum] = 0x0000000000000000;
321 perfdata_pmc3[i][powerpc_data_num] = 0x0000000000000000;
322 #endif /* POWERPC_PERF_USE_PMC */
323 }
324 }
325 #endif /* POWERPC_TBL_PERFORMANCE_REPORT */
326 } else
327 #endif /* HAVE_ALTIVEC */
328 {
329 // Non-AltiVec PPC optimisations
330
331 // ... pending ...
332 }
333 }