Conditionally compile some of the AltiVec optimizations.
[libav.git] / libavcodec / ppc / dsputil_ppc.c
CommitLineData
05c4072b
MN
1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
c4a17148 4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
05c4072b 5 *
b78e7197
DB
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
05c4072b
MN
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
b78e7197 11 * version 2.1 of the License, or (at your option) any later version.
05c4072b 12 *
b78e7197 13 * FFmpeg is distributed in the hope that it will be useful,
05c4072b
MN
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
b78e7197 19 * License along with FFmpeg; if not, write to the Free Software
5509bffa 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
05c4072b
MN
21 */
22
ab6c65f6
BF
23#include "../dsputil.h"
24
35e5fb06
RD
25#include "dsputil_ppc.h"
26
ab6c65f6
BF
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"

/* AltiVec routines installed into the DSPContext by dsputil_init_ppc();
   their definitions are not part of this file. */
void fdct_altivec(int16_t *block);
void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
                  int x16, int y16, int rounder);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Initializers called from dsputil_init_ppc(). */
void dsputil_h264_init_ppc(DSPContext *c, AVCodecContext *avctx);

void dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext *c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext *c, AVCodecContext *avctx);
void float_init_altivec(DSPContext *c, AVCodecContext *avctx);

#endif /* HAVE_ALTIVEC */
22b48b85 44
/* CPU capability flags; dsputil_init_ppc() ORs in MM_ALTIVEC when
   has_altivec() reports support. */
int mm_flags = 0;

/**
 * Report the SIMD capabilities available at run time.
 *
 * @return MM_ALTIVEC when the build has HAVE_ALTIVEC and has_altivec()
 *         returns non-zero, otherwise 0.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
57
e45a2872
RD
#ifdef POWERPC_PERFORMANCE_REPORT
/* Collected counter statistics, indexed [PMC][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain const char pointers: the entries are string literals that are only
   ever read (printed with %s), so neither 'unsigned char' nor mutability
   is appropriate. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif /* POWERPC_PERFORMANCE_REPORT */
92
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the collected performance-counter statistics through av_log().
 *
 * For every (function, PMC) pair whose sample count is non-zero, prints the
 * minimum, maximum, average and sample count stored in perfdata.
 */
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            /* Skip entries that never recorded a sample; this also keeps
               the average below from dividing by zero. */
            if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) {
                /* perfdata holds unsigned long long; cast so each argument
                   matches its PRIu64 conversion even where uint64_t is a
                   different type (e.g. unsigned long). */
                av_log(NULL, AV_LOG_INFO,
                       " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
                       perfname[i],
                       j+1,
                       (uint64_t)perfdata[j][i][powerpc_data_min],
                       (uint64_t)perfdata[j][i][powerpc_data_max],
                       (double)perfdata[j][i][powerpc_data_sum] /
                       (double)perfdata[j][i][powerpc_data_num],
                       (uint64_t)perfdata[j][i][powerpc_data_num]);
            }
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
35e5fb06
RD
116
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed; it always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbz_effect (now named
  check_dcbzl_effect) to use dcbzl, which is defined
  to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
142void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
143{
e45a2872 144POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
145 register int misal = ((unsigned long)blocks & 0x00000010);
146 register int i = 0;
e45a2872 147POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
148#if 1
149 if (misal) {
150 ((unsigned long*)blocks)[0] = 0L;
151 ((unsigned long*)blocks)[1] = 0L;
152 ((unsigned long*)blocks)[2] = 0L;
153 ((unsigned long*)blocks)[3] = 0L;
35e5fb06
RD
154 i += 16;
155 }
b1d041c1 156 for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
aab34ca0 157#ifndef __MWERKS__
3efd4952 158 asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
aab34ca0
MN
159#else
160 __dcbz( blocks, i );
161#endif
35e5fb06
RD
162 }
163 if (misal) {
164 ((unsigned long*)blocks)[188] = 0L;
165 ((unsigned long*)blocks)[189] = 0L;
166 ((unsigned long*)blocks)[190] = 0L;
167 ((unsigned long*)blocks)[191] = 0L;
168 i += 16;
169 }
170#else
171 memset(blocks, 0, sizeof(DCTELEM)*6*64);
172#endif
e45a2872 173POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
174}
175
a4adb608
MN
176/* same as above, when dcbzl clear a whole 128B cache line
177 i.e. the PPC970 aka G5 */
87ea51e0 178#ifdef HAVE_DCBZL
a4adb608
MN
179void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
180{
e45a2872 181POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
182 register int misal = ((unsigned long)blocks & 0x0000007f);
183 register int i = 0;
e45a2872 184POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
185#if 1
186 if (misal) {
187 // we could probably also optimize this case,
188 // but there's not much point as the machines
189 // aren't available yet (2003-06-26)
190 memset(blocks, 0, sizeof(DCTELEM)*6*64);
191 }
192 else
193 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
bb270c08 194 asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
a4adb608
MN
195 }
196#else
197 memset(blocks, 0, sizeof(DCTELEM)*6*64);
198#endif
e45a2872 199POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
200}
201#else
202void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
203{
204 memset(blocks, 0, sizeof(DCTELEM)*6*64);
205}
206#endif
207
#ifdef HAVE_DCBZL
/* check_dcbzl_effect reports how many bytes are set to 0 by dcbzl */
/* update 24/06/2003 : dcbz was replaced by dcbzl to get
   the intended effect (Apple "fixed" dcbz);
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
/**
 * Measure how many bytes one 'dcbzl' instruction clears.
 *
 * Fills a 1024-byte scratch buffer with 0xFF, issues dcbzl on its middle
 * and counts the bytes that read back as zero.
 *
 * @return the number of zeroed bytes, or 0 if the scratch buffer could not
 *         be allocated
 */
long check_dcbzl_effect(void)
{
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    long i = 0;
    long count = 0;

    if (!fakedata)
    {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
    /* The "memory" clobber tells the compiler that the instruction writes
       memory, so the reads below cannot be assumed to still see the 0xFF
       pattern stored by memset. */
    asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++)
    {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
/* Built without dcbzl support: report that nothing is cleared. */
long check_dcbzl_effect(void)
{
    return 0;
}
#endif
35e5fb06 251
a5db5bda
LB
252static void prefetch_ppc(void *mem, int stride, int h)
253{
254 register const uint8_t *p = mem;
255 do {
256 asm volatile ("dcbt 0,%0" : : "r" (p));
257 p+= stride;
258 } while(--h);
259}
260
b0368839 261void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
ab6c65f6 262{
a4adb608 263 // Common optimizations whether Altivec is available or not
a5db5bda 264 c->prefetch = prefetch_ppc;
73e4ff9d
LB
265 switch (check_dcbzl_effect()) {
266 case 32:
267 c->clear_blocks = clear_blocks_dcbz32_ppc;
268 break;
269 case 128:
270 c->clear_blocks = clear_blocks_dcbz128_ppc;
271 break;
272 default:
273 break;
274 }
a6a12a8a 275
b1d041c1 276#ifdef HAVE_ALTIVEC
8cff89be 277 if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
115329f1 278
ab6c65f6 279 if (has_altivec()) {
486497e0 280 mm_flags |= MM_ALTIVEC;
115329f1 281
73e4ff9d 282 dsputil_init_altivec(c, avctx);
8cff89be
DB
283 if(ENABLE_SNOW_DECODER) snow_init_altivec(c, avctx);
284 if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
285 vc1dsp_init_altivec(c, avctx);
241807f3 286 float_init_altivec(c, avctx);
bb270c08 287 c->gmc1 = gmc1_altivec;
b0368839 288
14cabd40 289#ifdef CONFIG_ENCODERS
bb270c08
DB
290 if (avctx->dct_algo == FF_DCT_AUTO ||
291 avctx->dct_algo == FF_DCT_ALTIVEC)
292 {
293 c->fdct = fdct_altivec;
294 }
14cabd40
JK
295#endif //CONFIG_ENCODERS
296
73e4ff9d
LB
297 if (avctx->lowres==0)
298 {
b0368839
MN
299 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
300 (avctx->idct_algo == FF_IDCT_ALTIVEC))
301 {
302 c->idct_put = idct_put_altivec;
303 c->idct_add = idct_add_altivec;
b0368839 304 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
73e4ff9d 305 }
b0368839 306 }
115329f1 307
e45a2872 308#ifdef POWERPC_PERFORMANCE_REPORT
db40a39a 309 {
e45a2872 310 int i, j;
35e5fb06 311 for (i = 0 ; i < powerpc_perf_total ; i++)
db40a39a 312 {
bb270c08
DB
313 for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
314 {
315 perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
316 perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
317 perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
318 perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
319 }
320 }
db40a39a 321 }
e45a2872 322#endif /* POWERPC_PERFORMANCE_REPORT */
ab6c65f6 323 }
75336fc8 324#endif /* HAVE_ALTIVEC */
ab6c65f6 325}