Commit | Line | Data |
---|---|---|
05c4072b MN |
1 | /* |
2 | * Copyright (c) 2002 Brian Foley | |
3 | * Copyright (c) 2002 Dieter Shirley | |
c4a17148 | 4 | * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
05c4072b MN |
5 | * |
6 | * This library is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * This library is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with this library; if not, write to the Free Software | |
5509bffa | 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
05c4072b MN |
19 | */ |
20 | ||
ab6c65f6 BF |
21 | #include "../dsputil.h" |
22 | ||
35e5fb06 RD |
23 | #include "dsputil_ppc.h" |
24 | ||
ab6c65f6 BF |
25 | #ifdef HAVE_ALTIVEC |
26 | #include "dsputil_altivec.h" | |
27 | #endif | |
28 | ||
14cabd40 | 29 | extern void fdct_altivec(int16_t *block); |
b0368839 MN |
30 | extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); |
31 | extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); | |
32 | ||
22b48b85 LB |
33 | extern void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width); |
34 | extern void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, | |
35 | DWTELEM *b2, DWTELEM *b3, | |
36 | DWTELEM *b4, DWTELEM *b5, | |
37 | int width); | |
38 | extern void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, | |
39 | uint8_t * * block, int b_w, int b_h, | |
40 | int src_x, int src_y, int src_stride, | |
41 | slice_buffer * sb, int add, | |
42 | uint8_t * dst8); | |
43 | ||
404d2241 BF |
44 | int mm_flags = 0; |
45 | ||
e629ab68 RD |
/**
 * Report the multimedia extensions that can be used.
 *
 * @return MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 *         has_altivec() returns non-zero; 0 otherwise.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
56 | ||
e45a2872 RD |
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Performance counter storage, indexed [pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* Human-readable names for the report, indexed by function.
   The list below must match the enum in dsputil_ppc.h.
   Declared as const char pointers: the entries are string literals and are
   printed with "%s", so plain (const) char is the matching type. */
static const char *const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
  "hadamard8_diff16_altivec",
  "avg_pixels8_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc",
  "put_h264_chroma_mc8_altivec",
  "avg_h264_chroma_mc8_altivec",
  "put_h264_qpel16_h_lowpass_altivec",
  "avg_h264_qpel16_h_lowpass_altivec",
  "put_h264_qpel16_v_lowpass_altivec",
  "avg_h264_qpel16_v_lowpass_altivec",
  "put_h264_qpel16_hv_lowpass_altivec",
  "avg_h264_qpel16_hv_lowpass_altivec",
  ""
};
#endif
91 | ||
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the contents of perfdata through av_log().
 *
 * For every function/PMC pair with a non-zero sample count, one entry is
 * printed with the recorded minimum, maximum, average (sum / count) and
 * the sample count itself. Pairs with no samples are skipped, which also
 * keeps the average from dividing by zero.
 */
void powerpc_display_perf_report(void)
{
    int func, pmc;

    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");

    for (func = 0; func < powerpc_perf_total; func++) {
        for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
            const unsigned long long samples = perfdata[pmc][func][powerpc_data_num];

            if (samples == (unsigned long long)0)
                continue;

            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                   perfname[func],
                   pmc + 1,
                   perfdata[pmc][func][powerpc_data_min],
                   perfdata[pmc][func][powerpc_data_max],
                   (double)perfdata[pmc][func][powerpc_data_sum] / (double)samples,
                   samples);
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
35e5fb06 RD |
115 | |
116 | /* ***** WARNING ***** WARNING ***** WARNING ***** */ | |
117 | /* | |
118 | clear_blocks_dcbz32_ppc will not work properly | |
119 | on PowerPC processors with a cache line size | |
120 | not equal to 32 bytes. | |
121 | Fortunately all processor used by Apple up to | |
122 | at least the 7450 (aka second generation G4) | |
123 | use 32 bytes cache line. | |
124 | This is due to the use of the 'dcbz' instruction. | |
125 | It simply clear to zero a single cache line, | |
126 | so you need to know the cache line size to use it ! | |
127 | It's absurd, but it's fast... | |
a4adb608 MN |
128 | |
129 | update 24/06/2003 : Apple released yesterday the G5, | |
130 | with a PPC970. cache line size : 128 bytes. Oups. | |
131 | The semantic of dcbz was changed, it always clear | |
132 | 32 bytes. so the function below will work, but will | |
133 | be slow. So I fixed check_dcbz_effect to use dcbzl, | |
134 | which is defined to clear a cache line (as dcbz before). | |
135 | So we still can distinguish, and use dcbz (32 bytes) | |
136 | or dcbzl (one cache line) as required. | |
137 | ||
138 | see <http://developer.apple.com/technotes/tn/tn2087.html> | |
139 | and <http://developer.apple.com/technotes/tn/tn2086.html> | |
35e5fb06 RD |
140 | */ |
141 | void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | |
142 | { | |
e45a2872 | 143 | POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); |
35e5fb06 RD |
144 | register int misal = ((unsigned long)blocks & 0x00000010); |
145 | register int i = 0; | |
e45a2872 | 146 | POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); |
35e5fb06 RD |
147 | #if 1 |
148 | if (misal) { | |
149 | ((unsigned long*)blocks)[0] = 0L; | |
150 | ((unsigned long*)blocks)[1] = 0L; | |
151 | ((unsigned long*)blocks)[2] = 0L; | |
152 | ((unsigned long*)blocks)[3] = 0L; | |
35e5fb06 RD |
153 | i += 16; |
154 | } | |
b1d041c1 | 155 | for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { |
aab34ca0 | 156 | #ifndef __MWERKS__ |
3efd4952 | 157 | asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); |
aab34ca0 MN |
158 | #else |
159 | __dcbz( blocks, i ); | |
160 | #endif | |
35e5fb06 RD |
161 | } |
162 | if (misal) { | |
163 | ((unsigned long*)blocks)[188] = 0L; | |
164 | ((unsigned long*)blocks)[189] = 0L; | |
165 | ((unsigned long*)blocks)[190] = 0L; | |
166 | ((unsigned long*)blocks)[191] = 0L; | |
167 | i += 16; | |
168 | } | |
169 | #else | |
170 | memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
171 | #endif | |
e45a2872 | 172 | POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); |
35e5fb06 RD |
173 | } |
174 | ||
a4adb608 MN |
175 | /* same as above, when dcbzl clear a whole 128B cache line |
176 | i.e. the PPC970 aka G5 */ | |
177 | #ifndef NO_DCBZL | |
178 | void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
179 | { | |
e45a2872 | 180 | POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); |
a4adb608 MN |
181 | register int misal = ((unsigned long)blocks & 0x0000007f); |
182 | register int i = 0; | |
e45a2872 | 183 | POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); |
a4adb608 MN |
184 | #if 1 |
185 | if (misal) { | |
186 | // we could probably also optimize this case, | |
187 | // but there's not much point as the machines | |
188 | // aren't available yet (2003-06-26) | |
189 | memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
190 | } | |
191 | else | |
192 | for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { | |
bb270c08 | 193 | asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); |
a4adb608 MN |
194 | } |
195 | #else | |
196 | memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
197 | #endif | |
e45a2872 | 198 | POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); |
a4adb608 MN |
199 | } |
200 | #else | |
201 | void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
202 | { | |
203 | memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
204 | } | |
205 | #endif | |
206 | ||
207 | #ifndef NO_DCBZL | |
35e5fb06 | 208 | /* check dcbz report how many bytes are set to 0 by dcbz */ |
a4adb608 MN |
209 | /* update 24/06/2003 : replace dcbz by dcbzl to get |
210 | the intended effect (Apple "fixed" dcbz) | |
211 | unfortunately this cannot be used unless the assembler | |
212 | knows about dcbzl ... */ | |
213 | long check_dcbzl_effect(void) | |
35e5fb06 | 214 | { |
3b991c54 | 215 | register char *fakedata = (char*)av_malloc(1024); |
35e5fb06 RD |
216 | register char *fakedata_middle; |
217 | register long zero = 0; | |
218 | register long i = 0; | |
219 | long count = 0; | |
220 | ||
3b991c54 | 221 | if (!fakedata) |
35e5fb06 RD |
222 | { |
223 | return 0L; | |
224 | } | |
225 | ||
35e5fb06 RD |
226 | fakedata_middle = (fakedata + 512); |
227 | ||
228 | memset(fakedata, 0xFF, 1024); | |
229 | ||
3efd4952 RD |
230 | /* below the constraint "b" seems to mean "Address base register" |
231 | in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ | |
232 | asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); | |
35e5fb06 RD |
233 | |
234 | for (i = 0; i < 1024 ; i ++) | |
235 | { | |
236 | if (fakedata[i] == (char)0) | |
237 | count++; | |
238 | } | |
239 | ||
3b991c54 | 240 | av_free(fakedata); |
115329f1 | 241 | |
35e5fb06 RD |
242 | return count; |
243 | } | |
a4adb608 MN |
244 | #else |
245 | long check_dcbzl_effect(void) | |
246 | { | |
247 | return 0; | |
248 | } | |
249 | #endif | |
35e5fb06 | 250 | |
a6a12a8a RD |
251 | |
252 | void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx); | |
253 | ||
b0368839 | 254 | void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) |
ab6c65f6 | 255 | { |
a4adb608 | 256 | // Common optimizations whether Altivec is available or not |
05c4072b | 257 | |
a4adb608 | 258 | switch (check_dcbzl_effect()) { |
35e5fb06 RD |
259 | case 32: |
260 | c->clear_blocks = clear_blocks_dcbz32_ppc; | |
261 | break; | |
a4adb608 MN |
262 | case 128: |
263 | c->clear_blocks = clear_blocks_dcbz128_ppc; | |
264 | break; | |
35e5fb06 RD |
265 | default: |
266 | break; | |
267 | } | |
a6a12a8a | 268 | |
b1d041c1 | 269 | #ifdef HAVE_ALTIVEC |
a6a12a8a | 270 | dsputil_h264_init_ppc(c, avctx); |
115329f1 | 271 | |
ab6c65f6 | 272 | if (has_altivec()) { |
404d2241 | 273 | mm_flags |= MM_ALTIVEC; |
115329f1 | 274 | |
05c4072b | 275 | // Altivec specific optimisations |
bb198e19 MN |
276 | c->pix_abs[0][1] = sad16_x2_altivec; |
277 | c->pix_abs[0][2] = sad16_y2_altivec; | |
278 | c->pix_abs[0][3] = sad16_xy2_altivec; | |
279 | c->pix_abs[0][0] = sad16_altivec; | |
280 | c->pix_abs[1][0] = sad8_altivec; | |
281 | c->sad[0]= sad16_altivec; | |
282 | c->sad[1]= sad8_altivec; | |
f2677d6b | 283 | c->pix_norm1 = pix_norm1_altivec; |
4013fcf4 FB |
284 | c->sse[1]= sse8_altivec; |
285 | c->sse[0]= sse16_altivec; | |
af19f78f ZK |
286 | c->pix_sum = pix_sum_altivec; |
287 | c->diff_pixels = diff_pixels_altivec; | |
288 | c->get_pixels = get_pixels_altivec; | |
fe50f385 | 289 | // next one disabled as it's untested. |
e629ab68 RD |
290 | #if 0 |
291 | c->add_bytes= add_bytes_altivec; | |
fe50f385 | 292 | #endif /* 0 */ |
db40a39a | 293 | c->put_pixels_tab[0][0] = put_pixels16_altivec; |
c4a17148 | 294 | /* the two functions do the same thing, so use the same code */ |
e45a2872 | 295 | c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; |
db40a39a | 296 | c->avg_pixels_tab[0][0] = avg_pixels16_altivec; |
35e5fb06 | 297 | c->avg_pixels_tab[1][0] = avg_pixels8_altivec; |
bb270c08 | 298 | c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; |
35e5fb06 | 299 | c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; |
fe50f385 RD |
300 | c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; |
301 | c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; | |
302 | c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; | |
115329f1 | 303 | |
bb270c08 | 304 | c->gmc1 = gmc1_altivec; |
b0368839 | 305 | |
bb270c08 DB |
306 | c->hadamard8_diff[0] = hadamard8_diff16_altivec; |
307 | c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; | |
c4a17148 | 308 | |
22b48b85 LB |
309 | |
310 | c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; | |
311 | c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; | |
312 | c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; | |
313 | ||
14cabd40 | 314 | #ifdef CONFIG_ENCODERS |
bb270c08 DB |
315 | if (avctx->dct_algo == FF_DCT_AUTO || |
316 | avctx->dct_algo == FF_DCT_ALTIVEC) | |
317 | { | |
318 | c->fdct = fdct_altivec; | |
319 | } | |
14cabd40 JK |
320 | #endif //CONFIG_ENCODERS |
321 | ||
4af5b6cd SS |
322 | if (avctx->lowres==0) |
323 | { | |
b0368839 MN |
324 | if ((avctx->idct_algo == FF_IDCT_AUTO) || |
325 | (avctx->idct_algo == FF_IDCT_ALTIVEC)) | |
326 | { | |
327 | c->idct_put = idct_put_altivec; | |
328 | c->idct_add = idct_add_altivec; | |
329 | #ifndef ALTIVEC_USE_REFERENCE_C_CODE | |
330 | c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |
331 | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
332 | c->idct_permutation_type = FF_NO_IDCT_PERM; | |
333 | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
334 | } | |
4af5b6cd | 335 | } |
115329f1 | 336 | |
e45a2872 | 337 | #ifdef POWERPC_PERFORMANCE_REPORT |
db40a39a | 338 | { |
e45a2872 | 339 | int i, j; |
35e5fb06 | 340 | for (i = 0 ; i < powerpc_perf_total ; i++) |
db40a39a | 341 | { |
bb270c08 DB |
342 | for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) |
343 | { | |
344 | perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; | |
345 | perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; | |
346 | perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; | |
347 | perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; | |
348 | } | |
349 | } | |
db40a39a | 350 | } |
e45a2872 | 351 | #endif /* POWERPC_PERFORMANCE_REPORT */ |
ab6c65f6 | 352 | } else |
fe50f385 | 353 | #endif /* HAVE_ALTIVEC */ |
ab6c65f6 | 354 | { |
05c4072b MN |
355 | // Non-AltiVec PPC optimisations |
356 | ||
357 | // ... pending ... | |
ab6c65f6 BF |
358 | } |
359 | } |