Commit | Line | Data |
---|---|---|
d2bb7db1 LM |
1 | /* |
2 | * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | |
3 | * | |
4 | * This library is free software; you can redistribute it and/or | |
5 | * modify it under the terms of the GNU Lesser General Public | |
6 | * License as published by the Free Software Foundation; either | |
7 | * version 2 of the License, or (at your option) any later version. | |
8 | * | |
9 | * This library is distributed in the hope that it will be useful, | |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | * Lesser General Public License for more details. | |
13 | * | |
14 | * You should have received a copy of the GNU Lesser General Public | |
15 | * License along with this library; if not, write to the Free Software | |
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 | */ | |
18 | ||
19 | ||
20 | /***********************************/ | |
21 | /* IDCT */ | |
22 | ||
23 | /* Butterfly on packed 16-bit words, done in place without a scratch register. in/out: mma=mma+mmb, mmb=mmb-mma (both in terms of the input values) */ | |
24 | #define SUMSUB_BA( a, b ) \ | |
25 | "paddw "#b", "#a" \n\t" /* a = a+b */\ | |
26 | "paddw "#b", "#b" \n\t" /* b = 2*b */\ | |
27 | "psubw "#a", "#b" \n\t" /* b = 2*b - (a+b) = b-a */ | |
28 | ||
29 | #define SUMSUB_BADC( a, b, c, d ) /* two interleaved butterflies on packed words: a = a+b, b = b-a and c = c+d, d = d-c (input values) */\ | |
30 | "paddw "#b", "#a" \n\t" /* a = a+b */\ | |
31 | "paddw "#d", "#c" \n\t" /* c = c+d */\ | |
32 | "paddw "#b", "#b" \n\t" /* b = 2*b */\ | |
33 | "paddw "#d", "#d" \n\t" /* d = 2*d */\ | |
34 | "psubw "#a", "#b" \n\t" /* b = 2*b - (a+b) = b-a */\ | |
35 | "psubw "#c", "#d" \n\t" /* d = 2*d - (c+d) = d-c */ | |
36 | ||
37 | #define SUMSUBD2_AB( a, b, t ) /* out: a = (a>>1) - b, b = a + (b>>1), both in terms of the input values; t is scratch */\ | |
38 | "movq "#b", "#t" \n\t" /* t = b (keep the unshifted value) */\ | |
39 | "psraw $1 , "#b" \n\t" /* b = b>>1 */\ | |
40 | "paddw "#a", "#b" \n\t" /* b = a + (b>>1) */\ | |
41 | "psraw $1 , "#a" \n\t" /* a = a>>1 */\ | |
42 | "psubw "#t", "#a" \n\t" /* a = (a>>1) - b */ | |
43 | ||
44 | #define IDCT4_1D( s02, s13, d02, d13, t ) /* one 1-D transform pass over four registers of packed 16-bit words; t is scratch and every argument register is overwritten */\ | |
45 | SUMSUB_BA ( s02, d02 ) /* s02 = s02+d02, d02 = d02-s02 */\ | |
46 | SUMSUBD2_AB( s13, d13, t ) /* s13 = (s13>>1)-d13, d13 = s13+(d13>>1) */\ | |
47 | SUMSUB_BADC( d13, s02, s13, d02 ) /* butterflies combining the two intermediate pairs into the four outputs */ | |
48 | ||
49 | #define SBUTTERFLY(a,b,t,n) /* interleave a and b with punpckl/punpckh of suffix n: the low-half interleave is left in a, the high-half interleave in t; b is only read */\ | |
50 | "movq " #a ", " #t " \n\t" /* abcd */\ | |
51 | "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ | |
52 | "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ | |
53 | ||
54 | #define TRANSPOSE4(a,b,c,d,t) /* 4x4 transpose of 16-bit words held in a,b,c,d; the transposed rows are left in a, d, t, c (in that order), b is used as scratch */\ | |
55 | SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ | |
56 | SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ | |
57 | SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ | |
58 | SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */ | |
59 | ||
60 | #define STORE_DIFF_4P( p, t, z ) /* add the four words in p, shifted right by 6, to the four bytes at (%0) and store them back; t is scratch, z is expected to hold zero */\ | |
61 | "psraw $6, "#p" \n\t" /* p >>= 6 (arithmetic) */\ | |
62 | "movd (%0), "#t" \n\t" /* load 4 destination bytes */\ | |
63 | "punpcklbw "#z", "#t" \n\t" /* widen them to 16-bit words */\ | |
64 | "paddsw "#t", "#p" \n\t" /* p += pixels (signed saturating) */\ | |
65 | "packuswb "#z", "#p" \n\t" /* saturate to unsigned bytes */\ | |
66 | "movd "#p", (%0) \n\t" /* store the 4 result bytes */ | |
67 | ||
68 | void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
69 | { /* Transforms the 4x4 block of 16-bit coefficients at block and adds the result, shifted right by 6, to the 4x4 byte area at dst (rows are stride bytes apart). The three asm statements pass data to each other in mm0-mm7. */ | |
70 | /* Load dct coeffs: 4 rows of 4 words. The asm reads memory through a register pointer, so it declares a memory clobber like the qpel asm blocks in this file. */ | |
71 | asm volatile( | |
72 | "movq (%0), %%mm0 \n\t" | |
73 | "movq 8(%0), %%mm1 \n\t" | |
74 | "movq 16(%0), %%mm2 \n\t" | |
75 | "movq 24(%0), %%mm3 \n\t" | |
76 | :: "r"(block) : "memory" ); | |
77 | ||
78 | asm volatile( | |
79 | /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ | |
80 | IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) | |
81 | ||
82 | "movq %0, %%mm6 \n\t" /* mm6 = ff_pw_32, the constant added before the final >>6 */ | |
83 | /* in: 1,4,0,2 out: 1,2,3,0 */ | |
84 | TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) | |
85 | ||
86 | "paddw %%mm6, %%mm3 \n\t" /* add the constant to one input row of the second pass */ | |
87 | ||
88 | /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ | |
89 | IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) | |
90 | ||
91 | "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used as the zero register by STORE_DIFF_4P */ | |
92 | :: "m"(ff_pw_32)); | |
93 | ||
94 | asm volatile( /* add the four result rows to dst, advancing dst by stride between rows */ | |
95 | STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) | |
96 | "add %1, %0 \n\t" | |
97 | STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) | |
98 | "add %1, %0 \n\t" | |
99 | STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) | |
100 | "add %1, %0 \n\t" | |
101 | STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) | |
102 | : "+r"(dst) | |
103 | : "r" ((long)stride) | |
104 | : "memory" ); /* the pixels at dst are read and written through the pointer */ | |
105 | } | |
106 | ||
107 | ||
108 | /***********************************/ | |
109 | /* deblocking */ | |
110 | ||
111 | // out: o = |x-y|>a, per unsigned byte; o is nonzero (not a full 0xFF mask) exactly where the difference exceeds a | |
112 | // clobbers: t; x, y and a are only read | |
113 | #define DIFF_GT_MMX(x,y,a,o,t)\ | |
114 | "movq "#y", "#t" \n\t"\ | |
115 | "movq "#x", "#o" \n\t"\ | |
116 | "psubusb "#x", "#t" \n\t" /* t = y-x, saturated at 0 */\ | |
117 | "psubusb "#y", "#o" \n\t" /* o = x-y, saturated at 0 */\ | |
118 | "por "#t", "#o" \n\t" /* o = abs(x-y): one of the two differences is always 0 */\ | |
119 | "psubusb "#a", "#o" \n\t" /* o = abs(x-y)-a saturated, nonzero only if abs(x-y) > a */ | |
120 | ||
121 | // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1; alpha1/beta1 are operands whose low 16-bit word holds the threshold | |
122 | // out: mm5=beta-1 (in every byte), mm7=mask: 0xFF in each byte where none of the three differences exceeds its threshold, 0 elsewhere | |
123 | // clobbers: mm4,mm6 | |
124 | #define H264_DEBLOCK_MASK(alpha1, beta1) \ | |
125 | "pshufw $0, "#alpha1", %%mm4 \n\t" /* replicate the low word into all four words */\ | |
126 | "pshufw $0, "#beta1 ", %%mm5 \n\t"\ | |
127 | "packuswb %%mm4, %%mm4 \n\t" /* words -> bytes (unsigned saturation): threshold in all 8 bytes */\ | |
128 | "packuswb %%mm5, %%mm5 \n\t"\ | |
129 | DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ | |
130 | DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ | |
131 | "por %%mm4, %%mm7 \n\t"\ | |
132 | DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ | |
133 | "por %%mm4, %%mm7 \n\t"\ | |
134 | "pxor %%mm6, %%mm6 \n\t"\ | |
135 | "pcmpeqb %%mm6, %%mm7 \n\t" /* bytes that stayed 0 (no threshold exceeded) become 0xFF */ | |
136 | ||
137 | // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask), the per-byte limit on the change; pb_01/pb_3f are operands presumably holding 0x01 / 0x3f in every byte (going by the names at the call sites) | |
138 | // out: mm1=p0' mm2=q0' | |
139 | // clobbers: mm0,3-6 | |
140 | #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ | |
141 | /* a = q0^p0^((p1-q1)>>2) */\ | |
142 | "movq %%mm0, %%mm4 \n\t"\ | |
143 | "psubb %%mm3, %%mm4 \n\t"\ | |
144 | "psrlw $2, %%mm4 \n\t"\ | |
145 | "pxor %%mm1, %%mm4 \n\t"\ | |
146 | "pxor %%mm2, %%mm4 \n\t"\ | |
147 | /* b = p0^(q1>>2) */\ | |
148 | "psrlw $2, %%mm3 \n\t" /* word shift: the pand below clears the bits that crossed a byte boundary */\ | |
149 | "pand "#pb_3f", %%mm3 \n\t"\ | |
150 | "movq %%mm1, %%mm5 \n\t"\ | |
151 | "pxor %%mm3, %%mm5 \n\t"\ | |
152 | /* c = q0^(p1>>2) */\ | |
153 | "psrlw $2, %%mm0 \n\t"\ | |
154 | "pand "#pb_3f", %%mm0 \n\t"\ | |
155 | "movq %%mm2, %%mm6 \n\t"\ | |
156 | "pxor %%mm0, %%mm6 \n\t"\ | |
157 | /* d = (c^b) & ~(b^a) & 1 */\ | |
158 | "pxor %%mm5, %%mm6 \n\t"\ | |
159 | "pxor %%mm4, %%mm5 \n\t"\ | |
160 | "pandn %%mm6, %%mm5 \n\t"\ | |
161 | "pand "#pb_01", %%mm5 \n\t"\ | |
162 | /* delta = (avg(q0, p1>>2) + (d&a)) | |
163 | * - (avg(p0, q1>>2) + (d&~a)) */\ | |
164 | "pavgb %%mm2, %%mm0 \n\t"\ | |
d3a9f798 MN |
165 | "pand %%mm5, %%mm4 \n\t"\ |
166 | "paddusb %%mm4, %%mm0 \n\t"\ | |
d2bb7db1 | 167 | "pavgb %%mm1, %%mm3 \n\t"\ |
d3a9f798 | 168 | "pxor %%mm5, %%mm4 \n\t"\ |
d2bb7db1 LM |
169 | "paddusb %%mm4, %%mm3 \n\t"\ |
170 | /* p0 += clip(delta, -tc0, tc0) | |
171 | * q0 -= clip(delta, -tc0, tc0) */\ | |
172 | "movq %%mm0, %%mm4 \n\t"\ | |
173 | "psubusb %%mm3, %%mm0 \n\t" /* mm0 = positive part of delta */\ | |
174 | "psubusb %%mm4, %%mm3 \n\t" /* mm3 = negative part of delta (as a magnitude) */\ | |
175 | "pminub %%mm7, %%mm0 \n\t" /* limit both parts to mm7 */\ | |
176 | "pminub %%mm7, %%mm3 \n\t"\ | |
177 | "paddusb %%mm0, %%mm1 \n\t"\ | |
178 | "paddusb %%mm3, %%mm2 \n\t"\ | |
179 | "psubusb %%mm3, %%mm1 \n\t"\ | |
180 | "psubusb %%mm0, %%mm2 \n\t" | |
181 | ||
182 | // in: mm1=p0 mm2=q0 %8=mm_bone; p1 = register holding the row to be filtered, q2 = register holding the next outer row (also available in memory at q2addr), tc0 = register holding the clip range. The macro is used for both the p side and the q side, so p1/q2 are generic names. | |
183 | // out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) | |
184 | // clobbers: q2, tmp, tc0 | |
185 | #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ | |
186 | "movq %%mm1, "#tmp" \n\t"\ | |
187 | "pavgb %%mm2, "#tmp" \n\t"\ | |
188 | "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ | |
189 | "pxor "q2addr", "#tmp" \n\t"\ | |
190 | "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ | |
191 | "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ | |
192 | "movq "#p1", "#tmp" \n\t"\ | |
193 | "psubusb "#tc0", "#tmp" \n\t" /* lower bound: row - tc0, saturated at 0 */\ | |
194 | "paddusb "#p1", "#tc0" \n\t" /* upper bound: row + tc0, saturated at 255 */\ | |
195 | "pmaxub "#tmp", "#q2" \n\t"\ | |
196 | "pminub "#tc0", "#q2" \n\t"\ | |
197 | "movq "#q2", "q1addr" \n\t" /* store the clipped row */ | |
198 | ||
199 | static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | |
200 | { /* Filters 8 pixels of a luma edge. Rows pix-3*stride .. pix+2*stride are read (p2,p1,p0,q0,q1,q2) and p1,p0,q0,q1 are written back. alpha1/beta1 are the thresholds minus 1; tc0[0] applies to the first 4 pixels and tc0[1] to the last 4. */ | |
201 | uint64_t tmp0; | |
202 | uint64_t tc = (uint32_t)(uint8_t)tc0[1]*0x01010000 | (uint32_t)(uint8_t)tc0[0]*0x0101; /* computed unsigned: the int product overflowed for byte values >= 0x80; only the low 32 bits are consumed (pshufw $80 below) */ | |
203 | // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask | |
204 | uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff }; | |
205 | ||
206 | asm volatile( | |
207 | "movq (%1,%3), %%mm0 \n\t" //p1 | |
208 | "movq (%1,%3,2), %%mm1 \n\t" //p0 | |
209 | "movq (%2), %%mm2 \n\t" //q0 | |
210 | "movq (%2,%3), %%mm3 \n\t" //q1 | |
211 | H264_DEBLOCK_MASK(%6, %7) | |
212 | "pand %5, %%mm7 \n\t" /* also drop the halves whose tc0 is negative */ | |
213 | "movq %%mm7, %0 \n\t" /* keep the mask in tmp0 for the q1 stage */ | |
214 | ||
215 | /* filter p1 */ | |
216 | "movq (%1), %%mm3 \n\t" //p2 | |
217 | DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 | |
218 | "pandn %%mm7, %%mm6 \n\t" | |
219 | "pcmpeqb %%mm7, %%mm6 \n\t" | |
220 | "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta | |
221 | "pshufw $80, %4, %%mm4 \n\t" /* words 0,0,1,1 of tc: tc0[0] in bytes 0-3, tc0[1] in bytes 4-7 */ | |
222 | "pand %%mm7, %%mm4 \n\t" // mask & tc0 | |
223 | "movq %8, %%mm7 \n\t" | |
224 | "pand %%mm6, %%mm7 \n\t" // mask & |p2-p0|<beta & 1 | |
225 | "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 | |
226 | "paddb %%mm4, %%mm7 \n\t" // tc++ | |
227 | H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) | |
228 | ||
229 | /* filter q1 */ | |
230 | "movq (%2,%3,2), %%mm4 \n\t" //q2 | |
231 | DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 | |
232 | "pandn %0, %%mm6 \n\t" | |
233 | "pcmpeqb %0, %%mm6 \n\t" | |
234 | "pand %0, %%mm6 \n\t" | |
235 | "pshufw $80, %4, %%mm5 \n\t" | |
236 | "pand %%mm6, %%mm5 \n\t" | |
237 | "pand %8, %%mm6 \n\t" | |
238 | "paddb %%mm6, %%mm7 \n\t" /* second possible increment of the p0/q0 clip limit */ | |
239 | "movq (%2,%3), %%mm3 \n\t" | |
240 | H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) | |
241 | ||
242 | /* filter p0, q0 */ | |
243 | H264_DEBLOCK_P0_Q0(%8, %9) | |
244 | "movq %%mm1, (%1,%3,2) \n\t" | |
245 | "movq %%mm2, (%2) \n\t" | |
246 | ||
247 | : "=m"(tmp0) | |
248 | : "r"(pix-3*stride), "r"(pix), "r"((long)stride), | |
249 | "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1), | |
250 | "m"(mm_bone), "m"(ff_pb_3F) | |
251 | : "memory" ); /* pixel rows are read and written through register pointers that are not memory operands */ | |
252 | } | |
253 | ||
254 | static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
255 | { | |
256 | int half; /* 0: first 8 pixels of the edge with tc0[0..1], 1: next 8 pixels with tc0[2..3] */ | |
257 | for(half=0; half<2; half++) | |
258 | if((tc0[2*half] & tc0[2*half+1]) >= 0) /* a half is skipped only when both of its tc0 values are negative */ | |
259 | h264_loop_filter_luma_mmx2(pix+8*half, stride, alpha-1, beta-1, tc0+2*half); | |
260 | } | |
261 | static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
262 | { | |
263 | /* FIXME: could cut some load/stores by merging transpose with filter */ | |
264 | /* also, it only needs to transpose 6x8 */ | |
265 | uint8_t tbuf[8*8]; /* 8x8 scratch block: the 8 columns around the edge, transposed into rows of 8 bytes */ | |
266 | int half; | |
267 | for(half=0; half<2; half++, pix+=8*stride, tc0+=2) { | |
268 | if((tc0[0] & tc0[1]) >= 0) { /* nothing to do for this half when both tc0 values are negative */ | |
269 | transpose4x4(tbuf, pix-4, 8, stride); | |
270 | transpose4x4(tbuf +4*8, pix, 8, stride); | |
271 | transpose4x4(tbuf+4, pix-4+4*stride, 8, stride); | |
272 | transpose4x4(tbuf+4+4*8, pix +4*stride, 8, stride); | |
273 | h264_loop_filter_luma_mmx2(tbuf+4*8, 8, alpha-1, beta-1, tc0); /* run the vertical-edge filter kernel on the transposed block */ | |
274 | transpose4x4(pix-2, tbuf +2*8, stride, 8); /* only rows 2..5 of the scratch block are copied back */ | |
275 | transpose4x4(pix-2+4*stride, tbuf+4+2*8, stride, 8); | |
276 | } | |
277 | } | |
278 | } | |
279 | ||
280 | static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | |
281 | { /* Filters 8 pixels of a chroma edge: rows pix-2*stride .. pix+stride are read (p1,p0,q0,q1) and p0,q0 are written back. alpha1/beta1 are the thresholds minus 1; the four tc0 bytes each apply to two neighbouring pixels. */ | |
282 | asm volatile( | |
283 | "movq (%0), %%mm0 \n\t" //p1 | |
284 | "movq (%0,%2), %%mm1 \n\t" //p0 | |
285 | "movq (%1), %%mm2 \n\t" //q0 | |
286 | "movq (%1,%2), %%mm3 \n\t" //q1 | |
287 | H264_DEBLOCK_MASK(%4, %5) | |
288 | "movd %3, %%mm6 \n\t" /* the four tc0 bytes */ | |
289 | "punpcklbw %%mm6, %%mm6 \n\t" /* duplicate each byte: one tc0 value per pixel pair */ | |
290 | "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask | |
291 | H264_DEBLOCK_P0_Q0(%6, %7) | |
292 | "movq %%mm1, (%0,%2) \n\t" | |
293 | "movq %%mm2, (%1) \n\t" | |
294 | ||
295 | :: "r"(pix-2*stride), "r"(pix), "r"((long)stride), | |
296 | "r"(*(uint32_t*)tc0), | |
297 | "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F) | |
298 | : "memory" ); /* pixel rows are read and written through register pointers that are not memory operands */ | |
299 | } | |
300 | ||
301 | static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
302 | { /* thin wrapper: the filter kernel takes the thresholds already decremented by one */ | |
303 | h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); | |
304 | } | |
305 | ||
306 | static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | |
307 | { | |
308 | uint8_t tbuf[8*4]; /* 4 rows of 8 bytes. FIXME: could cut some load/stores by merging transpose with filter */ | |
309 | uint8_t * const upper = pix-2, * const lower = upper+4*stride; /* the two 4x4 blocks straddling the edge */ | |
310 | transpose4x4(tbuf, upper, 8, stride); | |
311 | transpose4x4(tbuf+4, lower, 8, stride); | |
312 | h264_loop_filter_chroma_mmx2(tbuf+2*8, 8, alpha-1, beta-1, tc0); /* filter the transposed block with the vertical-edge kernel */ | |
313 | transpose4x4(upper, tbuf, stride, 8); | |
314 | transpose4x4(lower, tbuf+4, stride, 8); | |
315 | } | |
316 | ||
317 | // p0 = (p0 + q1 + 2*p1 + 2) >> 2, computed per byte with two pavgb; p0 is overwritten, p1/q1/one are only read, mm4 is clobbered | |
318 | #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ | |
319 | "movq "#p0", %%mm4 \n\t"\ | |
320 | "pxor "#q1", %%mm4 \n\t"\ | |
321 | "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ | |
322 | "pavgb "#q1", "#p0" \n\t"\ | |
323 | "psubusb %%mm4, "#p0" \n\t" /* undo the round-up of pavgb: p0 = (p0+q1)>>1 */\ | |
324 | "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ | |
325 | ||
326 | static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) | |
327 | { /* Intra variant for 8 pixels of a chroma edge: rows pix-2*stride .. pix+stride are read (p1,p0,q0,q1) and p0,q0 are replaced by their filtered values wherever the threshold mask is set. alpha1/beta1 are the thresholds minus 1. */ | |
328 | asm volatile( | |
329 | "movq (%0), %%mm0 \n\t" | |
330 | "movq (%0,%2), %%mm1 \n\t" | |
331 | "movq (%1), %%mm2 \n\t" | |
332 | "movq (%1,%2), %%mm3 \n\t" | |
333 | H264_DEBLOCK_MASK(%3, %4) | |
334 | "movq %%mm1, %%mm5 \n\t" /* keep the unfiltered p0 */ | |
335 | "movq %%mm2, %%mm6 \n\t" /* keep the unfiltered q0 */ | |
336 | H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' | |
337 | H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' | |
338 | "psubb %%mm5, %%mm1 \n\t" /* difference between filtered and original value */ | |
339 | "psubb %%mm6, %%mm2 \n\t" | |
340 | "pand %%mm7, %%mm1 \n\t" /* zero the difference where the mask is clear */ | |
341 | "pand %%mm7, %%mm2 \n\t" | |
342 | "paddb %%mm5, %%mm1 \n\t" /* original + masked difference */ | |
343 | "paddb %%mm6, %%mm2 \n\t" | |
344 | "movq %%mm1, (%0,%2) \n\t" | |
345 | "movq %%mm2, (%1) \n\t" | |
346 | :: "r"(pix-2*stride), "r"(pix), "r"((long)stride), | |
347 | "m"(alpha1), "m"(beta1), "m"(mm_bone) | |
348 | : "memory" ); /* pixel rows are read and written through register pointers that are not memory operands */ | |
349 | } | |
350 | ||
351 | static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | |
352 | { /* thin wrapper: the filter kernel takes the thresholds already decremented by one */ | |
353 | h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); | |
354 | } | |
355 | ||
356 | static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | |
357 | { | |
358 | uint8_t tbuf[8*4]; /* 4 rows of 8 bytes. FIXME: could cut some load/stores by merging transpose with filter */ | |
359 | uint8_t * const upper = pix-2, * const lower = upper+4*stride; /* the two 4x4 blocks straddling the edge */ | |
360 | transpose4x4(tbuf, upper, 8, stride); | |
361 | transpose4x4(tbuf+4, lower, 8, stride); | |
362 | h264_loop_filter_chroma_intra_mmx2(tbuf+2*8, 8, alpha-1, beta-1); /* filter the transposed block with the vertical-edge kernel */ | |
363 | transpose4x4(upper, tbuf, stride, 8); | |
364 | transpose4x4(lower, tbuf+4, stride, 8); | |
365 | } | |
366 | ||
367 | ||
368 | /***********************************/ | |
369 | /* motion compensation */ | |
370 | ||
371 | #define QPEL_H264V(A,B,C,D,E,F,OP) /* one output row of the vertical filter: A..E hold five source rows as words, F receives the next row from (%0); ((4*(C+D)-B-E)*%4 + A + F + %5) >> 5 is packed and passed through OP to (%1). %0 advances by %2, %1 by %3; A and mm6 are overwritten, mm7 must be zero. */\ | |
372 | "movd (%0), "#F" \n\t"\ | |
373 | "movq "#C", %%mm6 \n\t"\ | |
374 | "paddw "#D", %%mm6 \n\t"\ | |
375 | "psllw $2, %%mm6 \n\t"\ | |
376 | "psubw "#B", %%mm6 \n\t"\ | |
377 | "psubw "#E", %%mm6 \n\t"\ | |
378 | "pmullw %4, %%mm6 \n\t" /* mm6 = (4*(C+D) - B - E) * %4 */\ | |
379 | "add %2, %0 \n\t"\ | |
380 | "punpcklbw %%mm7, "#F" \n\t"\ | |
381 | "paddw %5, "#A" \n\t" /* rounding constant */\ | |
382 | "paddw "#F", "#A" \n\t"\ | |
383 | "paddw "#A", %%mm6 \n\t"\ | |
384 | "psraw $5, %%mm6 \n\t"\ | |
385 | "packuswb %%mm6, %%mm6 \n\t"\ | |
386 | OP(%%mm6, (%1), A, d)\ | |
387 | "add %3, %1 \n\t" | |
388 | ||
389 | #define QPEL_H264HV(A,B,C,D,E,F,OF) /* vertical filter step for the 2-D case: same taps as QPEL_H264V but without rounding or shift; the four 16-bit sums (4*(C+D)-B-E)*%3 + A + F are stored at byte offset OF from (%1). %0 advances by %2; A and mm6 are overwritten, mm7 must be zero. */\ | |
390 | "movd (%0), "#F" \n\t"\ | |
391 | "movq "#C", %%mm6 \n\t"\ | |
392 | "paddw "#D", %%mm6 \n\t"\ | |
393 | "psllw $2, %%mm6 \n\t"\ | |
394 | "psubw "#B", %%mm6 \n\t"\ | |
395 | "psubw "#E", %%mm6 \n\t"\ | |
396 | "pmullw %3, %%mm6 \n\t"\ | |
397 | "add %2, %0 \n\t"\ | |
398 | "punpcklbw %%mm7, "#F" \n\t"\ | |
399 | "paddw "#F", "#A" \n\t"\ | |
400 | "paddw "#A", %%mm6 \n\t"\ | |
401 | "movq %%mm6, "#OF"(%1) \n\t" /* keep full 16-bit precision for the horizontal pass */ | |
402 | ||
403 | #define QPEL_H264(OPNAME, OP, MMX) /* generates the h/v/hv lowpass functions for 4, 8 and 16 pixel blocks; OP(src_reg, dst_mem, tmp_reg, size_suffix) is the store operation */\ | |
404 | static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap filter over 4 rows of 4 pixels; reads src[-2..6] on each row */\ | |
405 | int h=4;\ | |
406 | \ | |
407 | asm volatile(\ | |
408 | "pxor %%mm7, %%mm7 \n\t"\ | |
409 | "movq %5, %%mm4 \n\t"\ | |
410 | "movq %6, %%mm5 \n\t"\ | |
411 | "1: \n\t"\ | |
412 | "movd -1(%0), %%mm1 \n\t"\ | |
413 | "movd (%0), %%mm2 \n\t"\ | |
414 | "movd 1(%0), %%mm3 \n\t"\ | |
415 | "movd 2(%0), %%mm0 \n\t"\ | |
416 | "punpcklbw %%mm7, %%mm1 \n\t"\ | |
417 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
418 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
419 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
420 | "paddw %%mm0, %%mm1 \n\t" /* mm1 = src[-1] + src[2] */\ | |
421 | "paddw %%mm3, %%mm2 \n\t" /* mm2 = src[0] + src[1] */\ | |
422 | "movd -2(%0), %%mm0 \n\t"\ | |
423 | "movd 3(%0), %%mm3 \n\t"\ | |
424 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
425 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
426 | "paddw %%mm3, %%mm0 \n\t" /* mm0 = src[-2] + src[3] */\ | |
427 | "psllw $2, %%mm2 \n\t"\ | |
428 | "psubw %%mm1, %%mm2 \n\t"\ | |
429 | "pmullw %%mm4, %%mm2 \n\t" /* (4*mm2 - mm1) * ff_pw_5 */\ | |
430 | "paddw %%mm5, %%mm0 \n\t" /* + ff_pw_16 (rounding) */\ | |
431 | "paddw %%mm2, %%mm0 \n\t"\ | |
432 | "psraw $5, %%mm0 \n\t"\ | |
433 | "packuswb %%mm0, %%mm0 \n\t"\ | |
434 | OP(%%mm0, (%1),%%mm6, d)\ | |
435 | "add %3, %0 \n\t"\ | |
436 | "add %4, %1 \n\t"\ | |
437 | "decl %2 \n\t"\ | |
438 | " jnz 1b \n\t"\ | |
439 | : "+a"(src), "+c"(dst), "+m"(h)\ | |
440 | : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
441 | : "memory"\ | |
442 | );\ | |
443 | }\ | |
444 | static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap filter producing 4 rows of 4 pixels; reads 9 source rows starting 2 rows above src */\ | |
445 | src -= 2*srcStride;\ | |
446 | asm volatile(\ | |
447 | "pxor %%mm7, %%mm7 \n\t"\ | |
448 | "movd (%0), %%mm0 \n\t" /* preload the first five rows into mm0..mm4 */\ | |
449 | "add %2, %0 \n\t"\ | |
450 | "movd (%0), %%mm1 \n\t"\ | |
451 | "add %2, %0 \n\t"\ | |
452 | "movd (%0), %%mm2 \n\t"\ | |
453 | "add %2, %0 \n\t"\ | |
454 | "movd (%0), %%mm3 \n\t"\ | |
455 | "add %2, %0 \n\t"\ | |
456 | "movd (%0), %%mm4 \n\t"\ | |
457 | "add %2, %0 \n\t"\ | |
458 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
459 | "punpcklbw %%mm7, %%mm1 \n\t"\ | |
460 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
461 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
462 | "punpcklbw %%mm7, %%mm4 \n\t"\ | |
463 | QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP) /* the register roles rotate by one for each output row */\ | |
464 | QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
465 | QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
466 | QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
467 | \ | |
468 | : "+a"(src), "+c"(dst)\ | |
469 | : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
470 | : "memory"\ | |
471 | );\ | |
472 | }\ | |
473 | static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2-D filter for a 4x4 block: a vertical pass writes 16-bit sums to tmp (rows of 24 bytes; tmpStride is not used), then a horizontal pass over tmp produces dst */\ | |
474 | int h=4;\ | |
475 | int w=3;\ | |
476 | src -= 2*srcStride+2;\ | |
477 | while(w--){ /* three column groups of 4 source pixels each */\ | |
478 | asm volatile(\ | |
479 | "pxor %%mm7, %%mm7 \n\t"\ | |
480 | "movd (%0), %%mm0 \n\t"\ | |
481 | "add %2, %0 \n\t"\ | |
482 | "movd (%0), %%mm1 \n\t"\ | |
483 | "add %2, %0 \n\t"\ | |
484 | "movd (%0), %%mm2 \n\t"\ | |
485 | "add %2, %0 \n\t"\ | |
486 | "movd (%0), %%mm3 \n\t"\ | |
487 | "add %2, %0 \n\t"\ | |
488 | "movd (%0), %%mm4 \n\t"\ | |
489 | "add %2, %0 \n\t"\ | |
490 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
491 | "punpcklbw %%mm7, %%mm1 \n\t"\ | |
492 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
493 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
494 | "punpcklbw %%mm7, %%mm4 \n\t"\ | |
495 | QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ | |
496 | QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ | |
497 | QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ | |
498 | QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ | |
499 | \ | |
500 | : "+a"(src)\ | |
501 | : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ | |
502 | : "memory"\ | |
503 | );\ | |
504 | tmp += 4;\ | |
505 | src += 4 - 9*srcStride; /* the asm advanced src by 9 rows; go back up and 4 pixels to the right */\ | |
506 | }\ | |
507 | tmp -= 3*4;\ | |
508 | asm volatile(\ | |
509 | "movq %4, %%mm6 \n\t"\ | |
510 | "1: \n\t"\ | |
511 | "movq (%0), %%mm0 \n\t"\ | |
512 | "paddw 10(%0), %%mm0 \n\t"\ | |
513 | "movq 2(%0), %%mm1 \n\t"\ | |
514 | "paddw 8(%0), %%mm1 \n\t"\ | |
515 | "movq 4(%0), %%mm2 \n\t"\ | |
516 | "paddw 6(%0), %%mm2 \n\t"\ | |
517 | "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ | |
518 | "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ | |
519 | "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ | |
520 | "paddsw %%mm2, %%mm0 \n\t"\ | |
521 | "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\ | |
522 | "paddw %%mm6, %%mm2 \n\t"\ | |
523 | "paddw %%mm2, %%mm0 \n\t"\ | |
524 | "psraw $6, %%mm0 \n\t"\ | |
525 | "packuswb %%mm0, %%mm0 \n\t"\ | |
526 | OP(%%mm0, (%1),%%mm7, d)\ | |
527 | "add $24, %0 \n\t"\ | |
528 | "add %3, %1 \n\t"\ | |
529 | "decl %2 \n\t"\ | |
530 | " jnz 1b \n\t"\ | |
531 | : "+a"(tmp), "+c"(dst), "+m"(h)\ | |
532 | : "S"((long)dstStride), "m"(ff_pw_32)\ | |
533 | : "memory"\ | |
534 | );\ | |
535 | }\ | |
536 | \ | |
537 | static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap filter over 8 rows of 8 pixels; low and high 4 pixels of each row are processed in parallel register pairs */\ | |
538 | int h=8;\ | |
539 | asm volatile(\ | |
540 | "pxor %%mm7, %%mm7 \n\t"\ | |
541 | "movq %5, %%mm6 \n\t"\ | |
542 | "1: \n\t"\ | |
543 | "movq (%0), %%mm0 \n\t"\ | |
544 | "movq 1(%0), %%mm2 \n\t"\ | |
545 | "movq %%mm0, %%mm1 \n\t"\ | |
546 | "movq %%mm2, %%mm3 \n\t"\ | |
547 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
548 | "punpckhbw %%mm7, %%mm1 \n\t"\ | |
549 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
550 | "punpckhbw %%mm7, %%mm3 \n\t"\ | |
551 | "paddw %%mm2, %%mm0 \n\t"\ | |
552 | "paddw %%mm3, %%mm1 \n\t"\ | |
553 | "psllw $2, %%mm0 \n\t" /* mm0/mm1 = 4*(src[0]+src[1]), low/high half */\ | |
554 | "psllw $2, %%mm1 \n\t"\ | |
555 | "movq -1(%0), %%mm2 \n\t"\ | |
556 | "movq 2(%0), %%mm4 \n\t"\ | |
557 | "movq %%mm2, %%mm3 \n\t"\ | |
558 | "movq %%mm4, %%mm5 \n\t"\ | |
559 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
560 | "punpckhbw %%mm7, %%mm3 \n\t"\ | |
561 | "punpcklbw %%mm7, %%mm4 \n\t"\ | |
562 | "punpckhbw %%mm7, %%mm5 \n\t"\ | |
563 | "paddw %%mm4, %%mm2 \n\t"\ | |
564 | "paddw %%mm3, %%mm5 \n\t"\ | |
565 | "psubw %%mm2, %%mm0 \n\t"\ | |
566 | "psubw %%mm5, %%mm1 \n\t"\ | |
567 | "pmullw %%mm6, %%mm0 \n\t"\ | |
568 | "pmullw %%mm6, %%mm1 \n\t"\ | |
569 | "movd -2(%0), %%mm2 \n\t"\ | |
570 | "movd 7(%0), %%mm5 \n\t"\ | |
571 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
572 | "punpcklbw %%mm7, %%mm5 \n\t"\ | |
573 | "paddw %%mm3, %%mm2 \n\t" /* mm3 still holds src[3..6], mm4 holds src[2..5]: reused as the outer taps */\ | |
574 | "paddw %%mm5, %%mm4 \n\t"\ | |
575 | "movq %6, %%mm5 \n\t"\ | |
576 | "paddw %%mm5, %%mm2 \n\t"\ | |
577 | "paddw %%mm5, %%mm4 \n\t"\ | |
578 | "paddw %%mm2, %%mm0 \n\t"\ | |
579 | "paddw %%mm4, %%mm1 \n\t"\ | |
580 | "psraw $5, %%mm0 \n\t"\ | |
581 | "psraw $5, %%mm1 \n\t"\ | |
582 | "packuswb %%mm1, %%mm0 \n\t"\ | |
583 | OP(%%mm0, (%1),%%mm5, q)\ | |
584 | "add %3, %0 \n\t"\ | |
585 | "add %4, %1 \n\t"\ | |
586 | "decl %2 \n\t"\ | |
587 | " jnz 1b \n\t"\ | |
588 | : "+a"(src), "+c"(dst), "+m"(h)\ | |
589 | : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
590 | : "memory"\ | |
591 | );\ | |
592 | }\ | |
593 | \ | |
594 | static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap filter for an 8x8 block, done as two passes over 4-pixel-wide columns */\ | |
595 | int h= 2;\ | |
596 | src -= 2*srcStride;\ | |
597 | \ | |
598 | while(h--){\ | |
599 | asm volatile(\ | |
600 | "pxor %%mm7, %%mm7 \n\t"\ | |
601 | "movd (%0), %%mm0 \n\t"\ | |
602 | "add %2, %0 \n\t"\ | |
603 | "movd (%0), %%mm1 \n\t"\ | |
604 | "add %2, %0 \n\t"\ | |
605 | "movd (%0), %%mm2 \n\t"\ | |
606 | "add %2, %0 \n\t"\ | |
607 | "movd (%0), %%mm3 \n\t"\ | |
608 | "add %2, %0 \n\t"\ | |
609 | "movd (%0), %%mm4 \n\t"\ | |
610 | "add %2, %0 \n\t"\ | |
611 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
612 | "punpcklbw %%mm7, %%mm1 \n\t"\ | |
613 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
614 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
615 | "punpcklbw %%mm7, %%mm4 \n\t"\ | |
616 | QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP) /* eight output rows; the register roles rotate by one per row */\ | |
617 | QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
618 | QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
619 | QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
620 | QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | |
621 | QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | |
622 | QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
623 | QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
624 | \ | |
625 | : "+a"(src), "+c"(dst)\ | |
626 | : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
627 | : "memory"\ | |
628 | );\ | |
629 | src += 4-13*srcStride; /* the asm advanced src by 13 rows and dst by 8 rows: rewind and step 4 pixels right */\ | |
630 | dst += 4-8*dstStride;\ | |
631 | }\ | |
632 | }\ | |
633 | static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2-D filter for an 8x8 block: a vertical pass writes 16-bit sums to tmp (rows of 32 bytes; tmpStride is not used), then a horizontal pass over tmp produces dst */\ | |
634 | int h=8;\ | |
635 | int w=4;\ | |
636 | src -= 2*srcStride+2;\ | |
637 | while(w--){ /* four column groups of 4 source pixels each */\ | |
638 | asm volatile(\ | |
639 | "pxor %%mm7, %%mm7 \n\t"\ | |
640 | "movd (%0), %%mm0 \n\t"\ | |
641 | "add %2, %0 \n\t"\ | |
642 | "movd (%0), %%mm1 \n\t"\ | |
643 | "add %2, %0 \n\t"\ | |
644 | "movd (%0), %%mm2 \n\t"\ | |
645 | "add %2, %0 \n\t"\ | |
646 | "movd (%0), %%mm3 \n\t"\ | |
647 | "add %2, %0 \n\t"\ | |
648 | "movd (%0), %%mm4 \n\t"\ | |
649 | "add %2, %0 \n\t"\ | |
650 | "punpcklbw %%mm7, %%mm0 \n\t"\ | |
651 | "punpcklbw %%mm7, %%mm1 \n\t"\ | |
652 | "punpcklbw %%mm7, %%mm2 \n\t"\ | |
653 | "punpcklbw %%mm7, %%mm3 \n\t"\ | |
654 | "punpcklbw %%mm7, %%mm4 \n\t"\ | |
655 | QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ | |
656 | QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ | |
657 | QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ | |
658 | QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ | |
659 | QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ | |
660 | QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ | |
661 | QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ | |
662 | QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ | |
663 | \ | |
664 | : "+a"(src)\ | |
665 | : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ | |
666 | : "memory"\ | |
667 | );\ | |
668 | tmp += 4;\ | |
669 | src += 4 - 13*srcStride; /* the asm advanced src by 13 rows; go back up and 4 pixels to the right */\ | |
670 | }\ | |
671 | tmp -= 4*4;\ | |
672 | asm volatile(\ | |
673 | "movq %4, %%mm6 \n\t"\ | |
674 | "1: \n\t"\ | |
675 | "movq (%0), %%mm0 \n\t"\ | |
676 | "movq 8(%0), %%mm3 \n\t"\ | |
677 | "movq 2(%0), %%mm1 \n\t"\ | |
678 | "movq 10(%0), %%mm4 \n\t"\ | |
679 | "paddw %%mm4, %%mm0 \n\t"\ | |
680 | "paddw %%mm3, %%mm1 \n\t"\ | |
681 | "paddw 18(%0), %%mm3 \n\t"\ | |
682 | "paddw 16(%0), %%mm4 \n\t"\ | |
683 | "movq 4(%0), %%mm2 \n\t"\ | |
684 | "movq 12(%0), %%mm5 \n\t"\ | |
685 | "paddw 6(%0), %%mm2 \n\t"\ | |
686 | "paddw 14(%0), %%mm5 \n\t"\ | |
687 | "psubw %%mm1, %%mm0 \n\t" /* same (a-b)/4 - b + c cascade as the 4x4 hv pass, for two groups of 4 outputs: mm0/mm1/mm2 and mm3/mm4/mm5 */\ | |
688 | "psubw %%mm4, %%mm3 \n\t"\ | |
689 | "psraw $2, %%mm0 \n\t"\ | |
690 | "psraw $2, %%mm3 \n\t"\ | |
691 | "psubw %%mm1, %%mm0 \n\t"\ | |
692 | "psubw %%mm4, %%mm3 \n\t"\ | |
693 | "paddsw %%mm2, %%mm0 \n\t"\ | |
694 | "paddsw %%mm5, %%mm3 \n\t"\ | |
695 | "psraw $2, %%mm0 \n\t"\ | |
696 | "psraw $2, %%mm3 \n\t"\ | |
697 | "paddw %%mm6, %%mm2 \n\t"\ | |
698 | "paddw %%mm6, %%mm5 \n\t"\ | |
699 | "paddw %%mm2, %%mm0 \n\t"\ | |
700 | "paddw %%mm5, %%mm3 \n\t"\ | |
701 | "psraw $6, %%mm0 \n\t"\ | |
702 | "psraw $6, %%mm3 \n\t"\ | |
703 | "packuswb %%mm3, %%mm0 \n\t"\ | |
704 | OP(%%mm0, (%1),%%mm7, q)\ | |
705 | "add $32, %0 \n\t"\ | |
706 | "add %3, %1 \n\t"\ | |
707 | "decl %2 \n\t"\ | |
708 | " jnz 1b \n\t"\ | |
709 | : "+a"(tmp), "+c"(dst), "+m"(h)\ | |
710 | : "S"((long)dstStride), "m"(ff_pw_32)\ | |
711 | : "memory"\ | |
712 | );\ | |
713 | }\ | |
714 | static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-wide vertical lowpass built from four calls of the 8-pixel variant: upper left, upper right, lower left, lower right */\ | |
715 | uint8_t * const dstLower = dst + 8*dstStride;\ | |
716 | uint8_t * const srcLower = src + 8*srcStride;\ | |
717 | OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst, src, dstStride, srcStride);\ | |
718 | OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst + 8, src + 8, dstStride, srcStride);\ | |
719 | OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dstLower, srcLower, dstStride, srcStride);\ | |
720 | OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dstLower + 8, srcLower + 8, dstStride, srcStride);\ | |
721 | }\ | |
722 | \ | |
723 | static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-wide horizontal lowpass built from four calls of the 8-pixel variant: upper left, upper right, lower left, lower right */\ | |
724 | uint8_t * const dstLower = dst + 8*dstStride;\ | |
725 | uint8_t * const srcLower = src + 8*srcStride;\ | |
726 | OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst, src, dstStride, srcStride);\ | |
727 | OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst + 8, src + 8, dstStride, srcStride);\ | |
728 | OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dstLower, srcLower, dstStride, srcStride);\ | |
729 | OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dstLower + 8, srcLower + 8, dstStride, srcStride);\ | |
730 | }\ | |
731 | \ | |
732 | static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 16-wide 2-D lowpass built from four calls of the 8-pixel variant; the same tmp scratch buffer is reused by every call */\ | |
733 | uint8_t * const dstLower = dst + 8*dstStride;\ | |
734 | uint8_t * const srcLower = src + 8*srcStride;\ | |
735 | OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride);\ | |
736 | OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst + 8, tmp, src + 8, dstStride, tmpStride, srcStride);\ | |
737 | OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dstLower, tmp, srcLower, dstStride, tmpStride, srcStride);\ | |
738 | OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dstLower + 8, tmp, srcLower + 8, dstStride, tmpStride, srcStride);\ | |
739 | }\ | |
740 | ||
741 | #define H264_MC(OPNAME, SIZE, MMX) \ | |
742 | static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
743 | OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\ | |
744 | }\ | |
745 | \ | |
746 | static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
747 | uint64_t temp[SIZE*SIZE/8];\ | |
748 | uint8_t * const half= (uint8_t*)temp;\ | |
749 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
750 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ | |
751 | }\ | |
752 | \ | |
753 | static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
754 | OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ | |
755 | }\ | |
756 | \ | |
757 | static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
758 | uint64_t temp[SIZE*SIZE/8];\ | |
759 | uint8_t * const half= (uint8_t*)temp;\ | |
760 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
761 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\ | |
762 | }\ | |
763 | \ | |
764 | static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
765 | uint64_t temp[SIZE*SIZE/8];\ | |
766 | uint8_t * const half= (uint8_t*)temp;\ | |
767 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
768 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ | |
769 | }\ | |
770 | \ | |
771 | static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
772 | OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
773 | }\ | |
774 | \ | |
775 | static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
776 | uint64_t temp[SIZE*SIZE/8];\ | |
777 | uint8_t * const half= (uint8_t*)temp;\ | |
778 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
779 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\ | |
780 | }\ | |
781 | \ | |
782 | static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
783 | uint64_t temp[SIZE*SIZE/4];\ | |
784 | uint8_t * const halfH= (uint8_t*)temp;\ | |
785 | uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
786 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
787 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
788 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
789 | }\ | |
790 | \ | |
791 | static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
792 | uint64_t temp[SIZE*SIZE/4];\ | |
793 | uint8_t * const halfH= (uint8_t*)temp;\ | |
794 | uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
795 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
796 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
797 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
798 | }\ | |
799 | \ | |
800 | static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
801 | uint64_t temp[SIZE*SIZE/4];\ | |
802 | uint8_t * const halfH= (uint8_t*)temp;\ | |
803 | uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
804 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
805 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
806 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
807 | }\ | |
808 | \ | |
809 | static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
810 | uint64_t temp[SIZE*SIZE/4];\ | |
811 | uint8_t * const halfH= (uint8_t*)temp;\ | |
812 | uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
813 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
814 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
815 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
816 | }\ | |
817 | \ | |
818 | static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
819 | uint64_t temp[SIZE*(SIZE+8)/4];\ | |
820 | int16_t * const tmp= (int16_t*)temp;\ | |
821 | OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\ | |
822 | }\ | |
823 | \ | |
824 | static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
825 | uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ | |
826 | uint8_t * const halfH= (uint8_t*)temp;\ | |
827 | uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
828 | int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ | |
829 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
830 | put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
831 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ | |
832 | }\ | |
833 | \ | |
834 | static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
835 | uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ | |
836 | uint8_t * const halfH= (uint8_t*)temp;\ | |
837 | uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
838 | int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ | |
839 | put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
840 | put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
841 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ | |
842 | }\ | |
843 | \ | |
844 | static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
845 | uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ | |
846 | uint8_t * const halfV= (uint8_t*)temp;\ | |
847 | uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
848 | int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ | |
849 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
850 | put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
851 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ | |
852 | }\ | |
853 | \ | |
854 | static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
855 | uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ | |
856 | uint8_t * const halfV= (uint8_t*)temp;\ | |
857 | uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
858 | int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ | |
859 | put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
860 | put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
861 | OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ | |
862 | }\ | |
863 | ||
864 | ||
/* Store step for the put_ kernels: expands to one "mov<width> val, dest" asm line.
 * The third argument is accepted so the signature matches the AVG_* operators, but it is unused. */
#define PUT_OP(val, dest, scratch, width) \
    "mov" #width " " #val ", " #dest " \n\t"
/* Store step for the avg_ kernels on 3DNow!: expands to three asm lines that
 *   1. load the current contents of dest into scratch,
 *   2. pavgusb scratch into val,
 *   3. write val back to dest.
 * Both val and scratch are overwritten. */
#define AVG_3DNOW_OP(val, dest, scratch, width) \
    "mov" #width " " #dest ", " #scratch " \n\t"\
    "pavgusb " #scratch ", " #val " \n\t"\
    "mov" #width " " #val ", " #dest " \n\t"
/* Store step for the avg_ kernels on MMX2: expands to three asm lines that
 *   1. load the current contents of dest into scratch,
 *   2. pavgb scratch into val,
 *   3. write val back to dest.
 * Both val and scratch are overwritten. */
#define AVG_MMX2_OP(val, dest, scratch, width) \
    "mov" #width " " #dest ", " #scratch " \n\t"\
    "pavgb " #scratch ", " #val " \n\t"\
    "mov" #width " " #val ", " #dest " \n\t"
874 | ||
875 | QPEL_H264(put_, PUT_OP, 3dnow) | |
876 | QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) | |
877 | QPEL_H264(put_, PUT_OP, mmx2) | |
878 | QPEL_H264(avg_, AVG_MMX2_OP, mmx2) | |
879 | ||
880 | H264_MC(put_, 4, 3dnow) | |
881 | H264_MC(put_, 8, 3dnow) | |
882 | H264_MC(put_, 16,3dnow) | |
883 | H264_MC(avg_, 4, 3dnow) | |
884 | H264_MC(avg_, 8, 3dnow) | |
885 | H264_MC(avg_, 16,3dnow) | |
886 | H264_MC(put_, 4, mmx2) | |
887 | H264_MC(put_, 8, mmx2) | |
888 | H264_MC(put_, 16,mmx2) | |
889 | H264_MC(avg_, 4, mmx2) | |
890 | H264_MC(avg_, 8, mmx2) | |
891 | H264_MC(avg_, 16,mmx2) | |
892 | ||
893 | ||
894 | #define H264_CHROMA_OP(S,D) | |
895 | #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx | |
896 | #include "dsputil_h264_template_mmx.c" | |
897 | #undef H264_CHROMA_OP | |
898 | #undef H264_CHROMA_MC8_TMPL | |
899 | ||
900 | #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" | |
901 | #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2 | |
902 | #include "dsputil_h264_template_mmx.c" | |
903 | #undef H264_CHROMA_OP | |
904 | #undef H264_CHROMA_MC8_TMPL | |
905 | ||
906 | #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" | |
907 | #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow | |
908 | #include "dsputil_h264_template_mmx.c" | |
909 | #undef H264_CHROMA_OP | |
910 | #undef H264_CHROMA_MC8_TMPL | |
911 |