hadamard8_diff* enabled on linux/ppc
[libav.git] / libavcodec / ppc / dsputil_ppc.c
CommitLineData
05c4072b
MN
1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
c4a17148 4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
05c4072b
MN
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
5509bffa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
05c4072b
MN
19 */
20
ab6c65f6
BF
21#include "../dsputil.h"
22
35e5fb06
RD
23#include "dsputil_ppc.h"
24
ab6c65f6
BF
25#ifdef HAVE_ALTIVEC
26#include "dsputil_altivec.h"
27#endif
28
14cabd40 29extern void fdct_altivec(int16_t *block);
b0368839
MN
30extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
31extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
32
22b48b85
LB
33extern void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width);
34extern void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1,
35 DWTELEM *b2, DWTELEM *b3,
36 DWTELEM *b4, DWTELEM *b5,
37 int width);
38extern void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
39 uint8_t * * block, int b_w, int b_h,
40 int src_x, int src_y, int src_stride,
41 slice_buffer * sb, int add,
42 uint8_t * dst8);
43
404d2241
BF
/* Capability flags for this process; dsputil_init_ppc() ORs MM_ALTIVEC
   into it once has_altivec() reports that AltiVec is usable. */
int mm_flags = 0;

/**
 * Report the SIMD capabilities of the running CPU.
 *
 * @return MM_ALTIVEC when the build has HAVE_ALTIVEC and has_altivec()
 *         is true, 0 otherwise.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
56
e45a2872
RD
#ifdef POWERPC_PERFORMANCE_REPORT
/* Accumulated performance-counter statistics, indexed as
   perfdata[pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* Human-readable names of the instrumented functions, printed with "%s"
   by powerpc_display_perf_report().
   The list below must match the enum in dsputil_ppc.h.
   The entries are string literals, so the table is plain (const) char
   rather than unsigned char, and is itself read-only. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif
91
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the collected performance-counter statistics through av_log().
 *
 * For every instrumented function and every enabled PMC that recorded at
 * least one sample, one entry is printed with the minimum, the maximum,
 * the average (sum divided by sample count) and the sample count.
 * Function/PMC pairs with no samples are skipped, which also keeps the
 * average from dividing by zero.
 */
void powerpc_display_perf_report(void)
{
    int func, pmc;

    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");

    for (func = 0; func < powerpc_perf_total; func++) {
        for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
            const unsigned long long *stats = perfdata[pmc][func];

            /* nothing was measured for this function on this counter */
            if (stats[powerpc_data_num] == (unsigned long long)0)
                continue;

            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                   perfname[func],
                   pmc + 1, /* PMCs are reported 1-based */
                   stats[powerpc_data_min],
                   stats[powerpc_data_max],
                   (double)stats[powerpc_data_sum] /
                   (double)stats[powerpc_data_num],
                   stats[powerpc_data_num]);
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
35e5fb06
RD
115
116/* ***** WARNING ***** WARNING ***** WARNING ***** */
117/*
118 clear_blocks_dcbz32_ppc will not work properly
119 on PowerPC processors with a cache line size
120 not equal to 32 bytes.
121 Fortunately all processor used by Apple up to
122 at least the 7450 (aka second generation G4)
123 use 32 bytes cache line.
124 This is due to the use of the 'dcbz' instruction.
125 It simply clear to zero a single cache line,
126 so you need to know the cache line size to use it !
127 It's absurd, but it's fast...
a4adb608
MN
128
129 update 24/06/2003 : Apple released yesterday the G5,
130 with a PPC970. cache line size : 128 bytes. Oups.
131 The semantic of dcbz was changed, it always clear
132 32 bytes. so the function below will work, but will
133 be slow. So I fixed check_dcbz_effect to use dcbzl,
134 which is defined to clear a cache line (as dcbz before).
135 So we still can distinguish, and use dcbz (32 bytes)
136 or dcbzl (one cache line) as required.
137
138 see <http://developer.apple.com/technotes/tn/tn2087.html>
139 and <http://developer.apple.com/technotes/tn/tn2086.html>
35e5fb06
RD
140*/
141void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
142{
e45a2872 143POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
144 register int misal = ((unsigned long)blocks & 0x00000010);
145 register int i = 0;
e45a2872 146POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
147#if 1
148 if (misal) {
149 ((unsigned long*)blocks)[0] = 0L;
150 ((unsigned long*)blocks)[1] = 0L;
151 ((unsigned long*)blocks)[2] = 0L;
152 ((unsigned long*)blocks)[3] = 0L;
35e5fb06
RD
153 i += 16;
154 }
b1d041c1 155 for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
aab34ca0 156#ifndef __MWERKS__
3efd4952 157 asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
aab34ca0
MN
158#else
159 __dcbz( blocks, i );
160#endif
35e5fb06
RD
161 }
162 if (misal) {
163 ((unsigned long*)blocks)[188] = 0L;
164 ((unsigned long*)blocks)[189] = 0L;
165 ((unsigned long*)blocks)[190] = 0L;
166 ((unsigned long*)blocks)[191] = 0L;
167 i += 16;
168 }
169#else
170 memset(blocks, 0, sizeof(DCTELEM)*6*64);
171#endif
e45a2872 172POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
173}
174
a4adb608
MN
175/* same as above, when dcbzl clear a whole 128B cache line
176 i.e. the PPC970 aka G5 */
177#ifndef NO_DCBZL
178void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
179{
e45a2872 180POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
181 register int misal = ((unsigned long)blocks & 0x0000007f);
182 register int i = 0;
e45a2872 183POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
184#if 1
185 if (misal) {
186 // we could probably also optimize this case,
187 // but there's not much point as the machines
188 // aren't available yet (2003-06-26)
189 memset(blocks, 0, sizeof(DCTELEM)*6*64);
190 }
191 else
192 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
bb270c08 193 asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
a4adb608
MN
194 }
195#else
196 memset(blocks, 0, sizeof(DCTELEM)*6*64);
197#endif
e45a2872 198POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
199}
200#else
201void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
202{
203 memset(blocks, 0, sizeof(DCTELEM)*6*64);
204}
205#endif
206
207#ifndef NO_DCBZL
35e5fb06 208/* check dcbz report how many bytes are set to 0 by dcbz */
a4adb608
MN
209/* update 24/06/2003 : replace dcbz by dcbzl to get
210 the intended effect (Apple "fixed" dcbz)
211 unfortunately this cannot be used unless the assembler
212 knows about dcbzl ... */
213long check_dcbzl_effect(void)
35e5fb06 214{
3b991c54 215 register char *fakedata = (char*)av_malloc(1024);
35e5fb06
RD
216 register char *fakedata_middle;
217 register long zero = 0;
218 register long i = 0;
219 long count = 0;
220
3b991c54 221 if (!fakedata)
35e5fb06
RD
222 {
223 return 0L;
224 }
225
35e5fb06
RD
226 fakedata_middle = (fakedata + 512);
227
228 memset(fakedata, 0xFF, 1024);
229
3efd4952
RD
230 /* below the constraint "b" seems to mean "Address base register"
231 in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
232 asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
35e5fb06
RD
233
234 for (i = 0; i < 1024 ; i ++)
235 {
236 if (fakedata[i] == (char)0)
237 count++;
238 }
239
3b991c54 240 av_free(fakedata);
115329f1 241
35e5fb06
RD
242 return count;
243}
a4adb608
MN
244#else
245long check_dcbzl_effect(void)
246{
247 return 0;
248}
249#endif
35e5fb06 250
a6a12a8a
RD
251
252void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
253
b0368839 254void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
ab6c65f6 255{
a4adb608 256 // Common optimizations whether Altivec is available or not
05c4072b 257
a4adb608 258 switch (check_dcbzl_effect()) {
35e5fb06
RD
259 case 32:
260 c->clear_blocks = clear_blocks_dcbz32_ppc;
261 break;
a4adb608
MN
262 case 128:
263 c->clear_blocks = clear_blocks_dcbz128_ppc;
264 break;
35e5fb06
RD
265 default:
266 break;
267 }
a6a12a8a 268
b1d041c1 269#ifdef HAVE_ALTIVEC
a6a12a8a 270 dsputil_h264_init_ppc(c, avctx);
115329f1 271
ab6c65f6 272 if (has_altivec()) {
404d2241 273 mm_flags |= MM_ALTIVEC;
115329f1 274
05c4072b 275 // Altivec specific optimisations
bb198e19
MN
276 c->pix_abs[0][1] = sad16_x2_altivec;
277 c->pix_abs[0][2] = sad16_y2_altivec;
278 c->pix_abs[0][3] = sad16_xy2_altivec;
279 c->pix_abs[0][0] = sad16_altivec;
280 c->pix_abs[1][0] = sad8_altivec;
281 c->sad[0]= sad16_altivec;
282 c->sad[1]= sad8_altivec;
f2677d6b 283 c->pix_norm1 = pix_norm1_altivec;
4013fcf4
FB
284 c->sse[1]= sse8_altivec;
285 c->sse[0]= sse16_altivec;
af19f78f
ZK
286 c->pix_sum = pix_sum_altivec;
287 c->diff_pixels = diff_pixels_altivec;
288 c->get_pixels = get_pixels_altivec;
fe50f385 289// next one disabled as it's untested.
e629ab68
RD
290#if 0
291 c->add_bytes= add_bytes_altivec;
fe50f385 292#endif /* 0 */
db40a39a 293 c->put_pixels_tab[0][0] = put_pixels16_altivec;
c4a17148 294 /* the two functions do the same thing, so use the same code */
e45a2872 295 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
db40a39a 296 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
35e5fb06 297 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
bb270c08 298 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
35e5fb06 299 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
fe50f385
RD
300 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
301 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
302 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
115329f1 303
bb270c08 304 c->gmc1 = gmc1_altivec;
b0368839 305
bb270c08
DB
306 c->hadamard8_diff[0] = hadamard8_diff16_altivec;
307 c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
c4a17148 308
22b48b85
LB
309
310 c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
311 c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
312 c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
313
14cabd40 314#ifdef CONFIG_ENCODERS
bb270c08
DB
315 if (avctx->dct_algo == FF_DCT_AUTO ||
316 avctx->dct_algo == FF_DCT_ALTIVEC)
317 {
318 c->fdct = fdct_altivec;
319 }
14cabd40
JK
320#endif //CONFIG_ENCODERS
321
4af5b6cd
SS
322 if (avctx->lowres==0)
323 {
b0368839
MN
324 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
325 (avctx->idct_algo == FF_IDCT_ALTIVEC))
326 {
327 c->idct_put = idct_put_altivec;
328 c->idct_add = idct_add_altivec;
329#ifndef ALTIVEC_USE_REFERENCE_C_CODE
330 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
331#else /* ALTIVEC_USE_REFERENCE_C_CODE */
332 c->idct_permutation_type = FF_NO_IDCT_PERM;
333#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
334 }
4af5b6cd 335 }
115329f1 336
e45a2872 337#ifdef POWERPC_PERFORMANCE_REPORT
db40a39a 338 {
e45a2872 339 int i, j;
35e5fb06 340 for (i = 0 ; i < powerpc_perf_total ; i++)
db40a39a 341 {
bb270c08
DB
342 for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
343 {
344 perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
345 perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
346 perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
347 perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
348 }
349 }
db40a39a 350 }
e45a2872 351#endif /* POWERPC_PERFORMANCE_REPORT */
ab6c65f6 352 } else
fe50f385 353#endif /* HAVE_ALTIVEC */
ab6c65f6 354 {
05c4072b
MN
355 // Non-AltiVec PPC optimisations
356
357 // ... pending ...
ab6c65f6
BF
358 }
359}