Add more missing includes after removing the implicit common.h
[libav.git] / libavcodec / ppc / dsputil_ppc.c
CommitLineData
05c4072b
MN
1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
c4a17148 4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
05c4072b 5 *
2912e87a 6 * This file is part of Libav.
b78e7197 7 *
2912e87a 8 * Libav is free software; you can redistribute it and/or
05c4072b
MN
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
b78e7197 11 * version 2.1 of the License, or (at your option) any later version.
05c4072b 12 *
2912e87a 13 * Libav is distributed in the hope that it will be useful,
05c4072b
MN
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
2912e87a 19 * License along with Libav; if not, write to the Free Software
5509bffa 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
05c4072b
MN
21 */
22
70766c21
MS
23#include <string.h>
24
c6c98d08 25#include "libavutil/cpu.h"
33e11284 26#include "libavutil/mem.h"
245976da 27#include "libavcodec/dsputil.h"
ab6c65f6 28#include "dsputil_altivec.h"
ab6c65f6 29
35e5fb06
RD
30/* ***** WARNING ***** WARNING ***** WARNING ***** */
31/*
e3905ce0
DB
32clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
33cache line size not equal to 32 bytes.
34Fortunately all processor used by Apple up to at least the 7450 (aka second
35generation G4) use 32 bytes cache line.
36This is due to the use of the 'dcbz' instruction. It simply clear to zero a
37single cache line, so you need to know the cache line size to use it !
38It's absurd, but it's fast...
a4adb608 39
e3905ce0
DB
40update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
41size: 128 bytes. Oups.
42The semantic of dcbz was changed, it always clear 32 bytes. so the function
43below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
44which is defined to clear a cache line (as dcbz before). So we still can
45distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
a4adb608 46
e3905ce0
DB
47see <http://developer.apple.com/technotes/tn/tn2087.html>
48and <http://developer.apple.com/technotes/tn/tn2086.html>
35e5fb06 49*/
ddb8c2c0 50static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
35e5fb06 51{
35e5fb06
RD
52 register int misal = ((unsigned long)blocks & 0x00000010);
53 register int i = 0;
35e5fb06 54 if (misal) {
e3905ce0
DB
55 ((unsigned long*)blocks)[0] = 0L;
56 ((unsigned long*)blocks)[1] = 0L;
57 ((unsigned long*)blocks)[2] = 0L;
58 ((unsigned long*)blocks)[3] = 0L;
59 i += 16;
35e5fb06 60 }
b1d041c1 61 for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
be449fca 62 __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
35e5fb06
RD
63 }
64 if (misal) {
e3905ce0
DB
65 ((unsigned long*)blocks)[188] = 0L;
66 ((unsigned long*)blocks)[189] = 0L;
67 ((unsigned long*)blocks)[190] = 0L;
68 ((unsigned long*)blocks)[191] = 0L;
69 i += 16;
35e5fb06 70 }
35e5fb06
RD
71}
72
a4adb608
MN
73/* same as above, when dcbzl clear a whole 128B cache line
74 i.e. the PPC970 aka G5 */
b250f9c6 75#if HAVE_DCBZL
ddb8c2c0 76static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
a4adb608 77{
a4adb608
MN
78 register int misal = ((unsigned long)blocks & 0x0000007f);
79 register int i = 0;
e3905ce0
DB
80 if (misal) {
81 // we could probably also optimize this case,
82 // but there's not much point as the machines
83 // aren't available yet (2003-06-26)
84 memset(blocks, 0, sizeof(DCTELEM)*6*64);
a4adb608
MN
85 }
86 else
e3905ce0 87 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
be449fca 88 __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
e3905ce0 89 }
a4adb608
MN
90}
91#else
ddb8c2c0 92static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
a4adb608 93{
e3905ce0 94 memset(blocks, 0, sizeof(DCTELEM)*6*64);
a4adb608
MN
95}
96#endif
97
/**
 * Report how many bytes a single 'dcbzl' instruction sets to zero.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, one dcbzl is issued on its
 * middle, and the zero bytes found afterwards are counted.
 *
 * Update 24/06/2003: dcbzl is used instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return the number of zeroed bytes; 0 if HAVE_DCBZL is unset or the
 *         scratch buffer could not be allocated
 */
static long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    long i;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* The constraint "b" seems to mean "address base register" in
     * gcc-3.3 / RS/6000 speak; it seems to avoid using r0.
     * The "memory" clobber tells the compiler that the instruction modifies
     * the buffer, so the bytes written by memset() above are re-read from
     * memory by the counting loop instead of being assumed unchanged. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);
#endif
    return count;
}
35e5fb06 139
a5db5bda
LB
/**
 * Issue a 'dcbt' cache-touch hint for a series of addresses.
 *
 * The first hint is for mem; each following one is stride bytes further.
 * The body runs before the counter is tested, so at least one hint is always
 * issued; the loop ends when the decremented h reaches zero.
 *
 * @param mem    first address to touch
 * @param stride distance in bytes between consecutive addresses
 * @param h      number of addresses to touch (presumably always positive;
 *               verify against callers)
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    const uint8_t *line = mem;

    for (;;) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (line));
        line += stride;
        if (--h == 0)
            break;
    }
}
148
9cf0841e 149void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
ab6c65f6 150{
a617c6aa 151 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
19a0729b 152
a1d0b6a2 153 // Common optimizations whether AltiVec is available or not
a5db5bda 154 c->prefetch = prefetch_ppc;
19a0729b 155 if (!high_bit_depth) {
73e4ff9d
LB
156 switch (check_dcbzl_effect()) {
157 case 32:
158 c->clear_blocks = clear_blocks_dcbz32_ppc;
159 break;
160 case 128:
161 c->clear_blocks = clear_blocks_dcbz128_ppc;
162 break;
163 default:
164 break;
165 }
19a0729b 166 }
a6a12a8a 167
b250f9c6 168#if HAVE_ALTIVEC
9cf0841e 169 if(CONFIG_H264_DECODER) ff_dsputil_h264_init_ppc(c, avctx);
115329f1 170
c6c98d08 171 if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
9cf0841e 172 ff_dsputil_init_altivec(c, avctx);
210f7284
MS
173 ff_float_init_altivec(c, avctx);
174 ff_int_init_altivec(c, avctx);
175 c->gmc1 = ff_gmc1_altivec;
b0368839 176
b250f9c6 177#if CONFIG_ENCODERS
0a72533e
MR
178 if (avctx->bits_per_raw_sample <= 8 &&
179 (avctx->dct_algo == FF_DCT_AUTO ||
180 avctx->dct_algo == FF_DCT_ALTIVEC)) {
210f7284 181 c->fdct = ff_fdct_altivec;
bb270c08 182 }
14cabd40
JK
183#endif //CONFIG_ENCODERS
184
2bcbd984 185 if (avctx->bits_per_raw_sample <= 8) {
e3905ce0
DB
186 if ((avctx->idct_algo == FF_IDCT_AUTO) ||
187 (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
210f7284
MS
188 c->idct_put = ff_idct_put_altivec;
189 c->idct_add = ff_idct_add_altivec;
e3905ce0
DB
190 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
191 }
b0368839 192 }
115329f1 193
ab6c65f6 194 }
75336fc8 195#endif /* HAVE_ALTIVEC */
ab6c65f6 196}