interlaced motion estimation
[libav.git] / libavcodec / ppc / dsputil_ppc.c
CommitLineData
05c4072b
MN
1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
ab6c65f6
BF
20#include "../dsputil.h"
21
35e5fb06
RD
22#include "dsputil_ppc.h"
23
ab6c65f6
BF
24#ifdef HAVE_ALTIVEC
25#include "dsputil_altivec.h"
26#endif
27
/* AltiVec DCT / IDCT entry points; only declared here, defined outside
   this file. They are installed by dsputil_init_ppc(). */
extern void fdct_altivec(int16_t *block);
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* CPU feature flags (MM_* bits); dsputil_init_ppc() ORs MM_ALTIVEC into it. */
int mm_flags = 0;
33
e629ab68
RD
/**
 * Report the SIMD capabilities usable on the running CPU.
 *
 * @return MM_ALTIVEC when the build defines HAVE_ALTIVEC and has_altivec()
 *         returns non-zero; 0 in every other case.
 */
int mm_support(void)
{
    int result = 0;
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        result |= MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return result;
}
44
e45a2872
RD
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Collected PMC statistics, indexed [pmc][profiled function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* Printable names of the profiled functions, indexed like the second
   dimension of perfdata.
   The list below must match the enum in dsputil_ppc.h */
static const char *const perfname[] = {
    "fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc"
};
#endif /* POWERPC_PERFORMANCE_REPORT */
67
#ifdef POWERPC_PERFORMANCE_REPORT
/*
 * Print the collected PMC statistics to stderr.
 *
 * For every profiled function and every enabled PMC that recorded at least
 * one sample, the minimum, maximum, average and sample count are written.
 * Entries whose sample count is zero are skipped, which also keeps the
 * average computation from dividing by zero.
 */
void powerpc_display_perf_report(void)
{
    int i, j;

    fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            if (perfdata[j][i][powerpc_data_num] == (unsigned long long)0)
                continue;
            fprintf(stderr,
                    " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[i],
                    j + 1,
                    perfdata[j][i][powerpc_data_min],
                    perfdata[j][i][powerpc_data_max],
                    (double)perfdata[j][i][powerpc_data_sum] /
                    (double)perfdata[j][i][powerpc_data_num],
                    perfdata[j][i][powerpc_data_num]);
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
35e5fb06
RD
91
92/* ***** WARNING ***** WARNING ***** WARNING ***** */
93/*
94 clear_blocks_dcbz32_ppc will not work properly
95 on PowerPC processors with a cache line size
96 not equal to 32 bytes.
97 Fortunately all processor used by Apple up to
98 at least the 7450 (aka second generation G4)
99 use 32 bytes cache line.
100 This is due to the use of the 'dcbz' instruction.
101 It simply clear to zero a single cache line,
102 so you need to know the cache line size to use it !
103 It's absurd, but it's fast...
a4adb608
MN
104
105 update 24/06/2003 : Apple released yesterday the G5,
106 with a PPC970. cache line size : 128 bytes. Oups.
107 The semantic of dcbz was changed, it always clear
108 32 bytes. so the function below will work, but will
109 be slow. So I fixed check_dcbz_effect to use dcbzl,
110 which is defined to clear a cache line (as dcbz before).
111 So we still can distinguish, and use dcbz (32 bytes)
112 or dcbzl (one cache line) as required.
113
114 see <http://developer.apple.com/technotes/tn/tn2087.html>
115 and <http://developer.apple.com/technotes/tn/tn2086.html>
35e5fb06
RD
116*/
117void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
118{
e45a2872 119POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
120 register int misal = ((unsigned long)blocks & 0x00000010);
121 register int i = 0;
e45a2872 122POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
123#if 1
124 if (misal) {
125 ((unsigned long*)blocks)[0] = 0L;
126 ((unsigned long*)blocks)[1] = 0L;
127 ((unsigned long*)blocks)[2] = 0L;
128 ((unsigned long*)blocks)[3] = 0L;
35e5fb06
RD
129 i += 16;
130 }
131 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
3efd4952 132 asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
35e5fb06
RD
133 }
134 if (misal) {
135 ((unsigned long*)blocks)[188] = 0L;
136 ((unsigned long*)blocks)[189] = 0L;
137 ((unsigned long*)blocks)[190] = 0L;
138 ((unsigned long*)blocks)[191] = 0L;
139 i += 16;
140 }
141#else
142 memset(blocks, 0, sizeof(DCTELEM)*6*64);
143#endif
e45a2872 144POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
35e5fb06
RD
145}
146
a4adb608
MN
147/* same as above, when dcbzl clear a whole 128B cache line
148 i.e. the PPC970 aka G5 */
149#ifndef NO_DCBZL
150void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
151{
e45a2872 152POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
153 register int misal = ((unsigned long)blocks & 0x0000007f);
154 register int i = 0;
e45a2872 155POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
156#if 1
157 if (misal) {
158 // we could probably also optimize this case,
159 // but there's not much point as the machines
160 // aren't available yet (2003-06-26)
161 memset(blocks, 0, sizeof(DCTELEM)*6*64);
162 }
163 else
164 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
3efd4952 165 asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
a4adb608
MN
166 }
167#else
168 memset(blocks, 0, sizeof(DCTELEM)*6*64);
169#endif
e45a2872 170POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
a4adb608
MN
171}
172#else
173void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
174{
175 memset(blocks, 0, sizeof(DCTELEM)*6*64);
176}
177#endif
178
179#ifndef NO_DCBZL
35e5fb06 180/* check dcbz report how many bytes are set to 0 by dcbz */
a4adb608
MN
181/* update 24/06/2003 : replace dcbz by dcbzl to get
182 the intended effect (Apple "fixed" dcbz)
183 unfortunately this cannot be used unless the assembler
184 knows about dcbzl ... */
185long check_dcbzl_effect(void)
35e5fb06 186{
3b991c54 187 register char *fakedata = (char*)av_malloc(1024);
35e5fb06
RD
188 register char *fakedata_middle;
189 register long zero = 0;
190 register long i = 0;
191 long count = 0;
192
3b991c54 193 if (!fakedata)
35e5fb06
RD
194 {
195 return 0L;
196 }
197
35e5fb06
RD
198 fakedata_middle = (fakedata + 512);
199
200 memset(fakedata, 0xFF, 1024);
201
3efd4952
RD
202 /* below the constraint "b" seems to mean "Address base register"
203 in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
204 asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
35e5fb06
RD
205
206 for (i = 0; i < 1024 ; i ++)
207 {
208 if (fakedata[i] == (char)0)
209 count++;
210 }
211
3b991c54 212 av_free(fakedata);
35e5fb06
RD
213
214 return count;
215}
a4adb608
MN
216#else
217long check_dcbzl_effect(void)
218{
219 return 0;
220}
221#endif
35e5fb06 222
b0368839 223void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
ab6c65f6 224{
a4adb608 225 // Common optimizations whether Altivec is available or not
05c4072b 226
a4adb608 227 switch (check_dcbzl_effect()) {
35e5fb06
RD
228 case 32:
229 c->clear_blocks = clear_blocks_dcbz32_ppc;
230 break;
a4adb608
MN
231 case 128:
232 c->clear_blocks = clear_blocks_dcbz128_ppc;
233 break;
35e5fb06
RD
234 default:
235 break;
236 }
237
3bbd2123 238#ifdef HAVE_ALTIVEC
ab6c65f6 239 if (has_altivec()) {
404d2241
BF
240 mm_flags |= MM_ALTIVEC;
241
05c4072b 242 // Altivec specific optimisations
bb198e19
MN
243 c->pix_abs[0][1] = sad16_x2_altivec;
244 c->pix_abs[0][2] = sad16_y2_altivec;
245 c->pix_abs[0][3] = sad16_xy2_altivec;
246 c->pix_abs[0][0] = sad16_altivec;
247 c->pix_abs[1][0] = sad8_altivec;
248 c->sad[0]= sad16_altivec;
249 c->sad[1]= sad8_altivec;
f2677d6b 250 c->pix_norm1 = pix_norm1_altivec;
4013fcf4
FB
251 c->sse[1]= sse8_altivec;
252 c->sse[0]= sse16_altivec;
af19f78f
ZK
253 c->pix_sum = pix_sum_altivec;
254 c->diff_pixels = diff_pixels_altivec;
255 c->get_pixels = get_pixels_altivec;
fe50f385 256// next one disabled as it's untested.
e629ab68
RD
257#if 0
258 c->add_bytes= add_bytes_altivec;
fe50f385 259#endif /* 0 */
db40a39a 260 c->put_pixels_tab[0][0] = put_pixels16_altivec;
e45a2872
RD
261 /* the tow functions do the same thing, so use the same code */
262 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
db40a39a 263 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
35e5fb06
RD
264// next one disabled as it's untested.
265#if 0
266 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
fe50f385 267#endif /* 0 */
35e5fb06 268 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
fe50f385
RD
269 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
270 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
271 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
35e5fb06 272
e629ab68 273 c->gmc1 = gmc1_altivec;
b0368839 274
14cabd40
JK
275#ifdef CONFIG_ENCODERS
276 if (avctx->dct_algo == FF_DCT_AUTO ||
277 avctx->dct_algo == FF_DCT_ALTIVEC)
278 {
279 c->fdct = fdct_altivec;
280 }
281#endif //CONFIG_ENCODERS
282
b0368839
MN
283 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
284 (avctx->idct_algo == FF_IDCT_ALTIVEC))
285 {
286 c->idct_put = idct_put_altivec;
287 c->idct_add = idct_add_altivec;
288#ifndef ALTIVEC_USE_REFERENCE_C_CODE
289 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
290#else /* ALTIVEC_USE_REFERENCE_C_CODE */
291 c->idct_permutation_type = FF_NO_IDCT_PERM;
292#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
293 }
fe50f385 294
e45a2872 295#ifdef POWERPC_PERFORMANCE_REPORT
db40a39a 296 {
e45a2872 297 int i, j;
35e5fb06 298 for (i = 0 ; i < powerpc_perf_total ; i++)
db40a39a 299 {
e45a2872
RD
300 for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
301 {
302 perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF;
303 perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000;
304 perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000;
305 perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000;
306 }
307 }
db40a39a 308 }
e45a2872 309#endif /* POWERPC_PERFORMANCE_REPORT */
ab6c65f6 310 } else
fe50f385 311#endif /* HAVE_ALTIVEC */
ab6c65f6 312 {
05c4072b
MN
313 // Non-AltiVec PPC optimisations
314
315 // ... pending ...
ab6c65f6
BF
316 }
317}