seperated out the C-based VP3 DSP functions into a different file; also
[libav.git] / libavcodec / i386 / vp3dsp_mmx.c
1 /*
2 * Copyright (C) 2004 the ffmpeg project
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19 /**
20 * @file vp3dsp_mmx.c
21 * MMX-optimized functions cribbed from the original VP3 source code.
22 */
23
24 #include "../dsputil.h"
25 #include "mmx.h"
26
27 #define IdctAdjustBeforeShift 8
28
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30 * idct_constants[0..15] = Mask table (M(I))
31 * idct_constants[16..43] = Cosine table (C(I))
32 * idct_constants[44..47] = 8
33 */
34 static uint16_t idct_constants[(4 + 7 + 1) * 4];
35 static uint16_t idct_cosine_table[7] = {
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
37 };
38
39 #define r0 mm0
40 #define r1 mm1
41 #define r2 mm2
42 #define r3 mm3
43 #define r4 mm4
44 #define r5 mm5
45 #define r6 mm6
46 #define r7 mm7
47
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
49 #define BeginIDCT() \
50 movq_m2r(*I(3), r2); \
51 movq_m2r(*C(3), r6); \
52 movq_r2r(r2, r4); \
53 movq_m2r(*J(5), r7); \
54 pmulhw_r2r(r6, r4); \
55 movq_m2r(*C(5), r1); \
56 pmulhw_r2r(r7, r6); \
57 movq_r2r(r1, r5); \
58 pmulhw_r2r(r2, r1); \
59 movq_m2r(*I(1), r3); \
60 pmulhw_r2r(r7, r5); \
61 movq_m2r(*C(1), r0); \
62 paddw_r2r(r2, r4); \
63 paddw_r2r(r7, r6); \
64 paddw_r2r(r1, r2); \
65 movq_m2r(*J(7), r1); \
66 paddw_r2r(r5, r7); \
67 movq_r2r(r0, r5); \
68 pmulhw_r2r(r3, r0); \
69 paddsw_r2r(r7, r4); \
70 pmulhw_r2r(r1, r5); \
71 movq_m2r(*C(7), r7); \
72 psubsw_r2r(r2, r6); \
73 paddw_r2r(r3, r0); \
74 pmulhw_r2r(r7, r3); \
75 movq_m2r(*I(2), r2); \
76 pmulhw_r2r(r1, r7); \
77 paddw_r2r(r1, r5); \
78 movq_r2r(r2, r1); \
79 pmulhw_m2r(*C(2), r2); \
80 psubsw_r2r(r5, r3); \
81 movq_m2r(*J(6), r5); \
82 paddsw_r2r(r7, r0); \
83 movq_r2r(r5, r7); \
84 psubsw_r2r(r4, r0); \
85 pmulhw_m2r(*C(2), r5); \
86 paddw_r2r(r1, r2); \
87 pmulhw_m2r(*C(6), r1); \
88 paddsw_r2r(r4, r4); \
89 paddsw_r2r(r0, r4); \
90 psubsw_r2r(r6, r3); \
91 paddw_r2r(r7, r5); \
92 paddsw_r2r(r6, r6); \
93 pmulhw_m2r(*C(6), r7); \
94 paddsw_r2r(r3, r6); \
95 movq_r2m(r4, *I(1)); \
96 psubsw_r2r(r5, r1); \
97 movq_m2r(*C(4), r4); \
98 movq_r2r(r3, r5); \
99 pmulhw_r2r(r4, r3); \
100 paddsw_r2r(r2, r7); \
101 movq_r2m(r6, *I(2)); \
102 movq_r2r(r0, r2); \
103 movq_m2r(*I(0), r6); \
104 pmulhw_r2r(r4, r0); \
105 paddw_r2r(r3, r5); \
106 movq_m2r(*J(4), r3); \
107 psubsw_r2r(r1, r5); \
108 paddw_r2r(r0, r2); \
109 psubsw_r2r(r3, r6); \
110 movq_r2r(r6, r0); \
111 pmulhw_r2r(r4, r6); \
112 paddsw_r2r(r3, r3); \
113 paddsw_r2r(r1, r1); \
114 paddsw_r2r(r0, r3); \
115 paddsw_r2r(r5, r1); \
116 pmulhw_r2r(r3, r4); \
117 paddsw_r2r(r0, r6); \
118 psubsw_r2r(r2, r6); \
119 paddsw_r2r(r2, r2); \
120 movq_m2r(*I(1), r0); \
121 paddsw_r2r(r6, r2); \
122 paddw_r2r(r3, r4); \
123 psubsw_r2r(r1, r2);
124
125 /* RowIDCT gets ready to transpose */
126 #define RowIDCT() \
127 \
128 BeginIDCT() \
129 \
130 movq_m2r(*I(2), r3); \
131 psubsw_r2r(r7, r4); \
132 paddsw_r2r(r1, r1); \
133 paddsw_r2r(r7, r7); \
134 paddsw_r2r(r2, r1); \
135 paddsw_r2r(r4, r7); \
136 psubsw_r2r(r3, r4); \
137 psubsw_r2r(r5, r6); \
138 paddsw_r2r(r5, r5); \
139 paddsw_r2r(r4, r3); \
140 paddsw_r2r(r6, r5); \
141 psubsw_r2r(r0, r7); \
142 paddsw_r2r(r0, r0); \
143 movq_r2m(r1, *I(1)); \
144 paddsw_r2r(r7, r0);
145
146 /* Column IDCT normalizes and stores final results */
147 #define ColumnIDCT() \
148 \
149 BeginIDCT() \
150 \
151 paddsw_m2r(*Eight, r2); \
152 paddsw_r2r(r1, r1); \
153 paddsw_r2r(r2, r1); \
154 psraw_i2r(4, r2); \
155 psubsw_r2r(r7, r4); \
156 psraw_i2r(4, r1); \
157 movq_m2r(*I(2), r3); \
158 paddsw_r2r(r7, r7); \
159 movq_r2m(r2, *I(2)); \
160 paddsw_r2r(r4, r7); \
161 movq_r2m(r1, *I(1)); \
162 psubsw_r2r(r3, r4); \
163 paddsw_m2r(*Eight, r4); \
164 paddsw_r2r(r3, r3); \
165 paddsw_r2r(r4, r3); \
166 psraw_i2r(4, r4); \
167 psubsw_r2r(r5, r6); \
168 psraw_i2r(4, r3); \
169 paddsw_m2r(*Eight, r6); \
170 paddsw_r2r(r5, r5); \
171 paddsw_r2r(r6, r5); \
172 psraw_i2r(4, r6); \
173 movq_r2m(r4, *J(4)); \
174 psraw_i2r(4, r5); \
175 movq_r2m(r3, *I(3)); \
176 psubsw_r2r(r0, r7); \
177 paddsw_m2r(*Eight, r7); \
178 paddsw_r2r(r0, r0); \
179 paddsw_r2r(r7, r0); \
180 psraw_i2r(4, r7); \
181 movq_r2m(r6, *J(6)); \
182 psraw_i2r(4, r0); \
183 movq_r2m(r5, *J(5)); \
184 movq_r2m(r7, *J(7)); \
185 movq_r2m(r0, *I(0));
186
187
188 /* Following macro does two 4x4 transposes in place.
189
190 At entry (we assume):
191
192 r0 = a3 a2 a1 a0
193 I(1) = b3 b2 b1 b0
194 r2 = c3 c2 c1 c0
195 r3 = d3 d2 d1 d0
196
197 r4 = e3 e2 e1 e0
198 r5 = f3 f2 f1 f0
199 r6 = g3 g2 g1 g0
200 r7 = h3 h2 h1 h0
201
202 At exit, we have:
203
204 I(0) = d0 c0 b0 a0
205 I(1) = d1 c1 b1 a1
206 I(2) = d2 c2 b2 a2
207 I(3) = d3 c3 b3 a3
208
209 J(4) = h0 g0 f0 e0
210 J(5) = h1 g1 f1 e1
211 J(6) = h2 g2 f2 e2
212 J(7) = h3 g3 f3 e3
213
214 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
215 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
216
217 Since r1 is free at entry, we calculate the Js first. */
218
219 #define Transpose() \
220 movq_r2r(r4, r1); \
221 punpcklwd_r2r(r5, r4); \
222 movq_r2m(r0, *I(0)); \
223 punpckhwd_r2r(r5, r1); \
224 movq_r2r(r6, r0); \
225 punpcklwd_r2r(r7, r6); \
226 movq_r2r(r4, r5); \
227 punpckldq_r2r(r6, r4); \
228 punpckhdq_r2r(r6, r5); \
229 movq_r2r(r1, r6); \
230 movq_r2m(r4, *J(4)); \
231 punpckhwd_r2r(r7, r0); \
232 movq_r2m(r5, *J(5)); \
233 punpckhdq_r2r(r0, r6); \
234 movq_m2r(*I(0), r4); \
235 punpckldq_r2r(r0, r1); \
236 movq_m2r(*I(1), r5); \
237 movq_r2r(r4, r0); \
238 movq_r2m(r6, *J(7)); \
239 punpcklwd_r2r(r5, r0); \
240 movq_r2m(r1, *J(6)); \
241 punpckhwd_r2r(r5, r4); \
242 movq_r2r(r2, r5); \
243 punpcklwd_r2r(r3, r2); \
244 movq_r2r(r0, r1); \
245 punpckldq_r2r(r2, r0); \
246 punpckhdq_r2r(r2, r1); \
247 movq_r2r(r4, r2); \
248 movq_r2m(r0, *I(0)); \
249 punpckhwd_r2r(r3, r5); \
250 movq_r2m(r1, *I(1)); \
251 punpckhdq_r2r(r5, r4); \
252 punpckldq_r2r(r5, r2); \
253 movq_r2m(r4, *I(3)); \
254 movq_r2m(r2, *I(2));
255
256
257 void vp3_dsp_init_mmx(void)
258 {
259 int j = 16;
260 uint16_t *p;
261
262 do {
263 idct_constants[--j] = 0;
264 } while (j);
265
266 idct_constants[0] = idct_constants[5] =
267 idct_constants[10] = idct_constants[15] = 65535;
268
269 j = 1;
270 do {
271 p = idct_constants + ((j + 3) << 2);
272 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
273 } while (++j <= 7);
274
275 idct_constants[44] = idct_constants[45] =
276 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
277 }
278
279 static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
280 int16_t *output_data)
281 {
282 /* eax = quantized input
283 * ebx = dequantizer matrix
284 * ecx = IDCT constants
285 * M(I) = ecx + MaskOffset(0) + I * 8
286 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
287 * edx = output
288 * r0..r7 = mm0..mm7
289 */
290
291 #define M(x) (idct_constants + x * 4)
292 #define C(x) (idct_constants + 16 + (x - 1) * 4)
293 #define Eight (idct_constants + 44)
294
295 movq_m2r(*input_data, r0);
296 pmullw_m2r(*dequant_matrix, r0);
297 movq_m2r(*(input_data + 8), r1);
298 pmullw_m2r(*(dequant_matrix + 8), r1);
299 movq_m2r(*M(0), r2);
300 movq_r2r(r0, r3);
301 movq_m2r(*(input_data + 4), r4);
302 psrlq_i2r(16, r0);
303 pmullw_m2r(*(dequant_matrix + 4), r4);
304 pand_r2r(r2, r3);
305 movq_r2r(r0, r5);
306 movq_r2r(r1, r6);
307 pand_r2r(r2, r5);
308 psllq_i2r(32, r6);
309 movq_m2r(*M(3), r7);
310 pxor_r2r(r5, r0);
311 pand_r2r(r6, r7);
312 por_r2r(r3, r0);
313 pxor_r2r(r7, r6);
314 por_r2r(r7, r0);
315 movq_m2r(*M(3), r7);
316 movq_r2r(r4, r3);
317 movq_r2m(r0, *output_data);
318
319 pand_r2r(r2, r3);
320 movq_m2r(*(input_data + 16), r0);
321 psllq_i2r(16, r3);
322 pmullw_m2r(*(dequant_matrix + 16), r0);
323 pand_r2r(r1, r7);
324 por_r2r(r3, r5);
325 por_r2r(r6, r7);
326 movq_m2r(*(input_data + 12), r3);
327 por_r2r(r5, r7);
328 pmullw_m2r(*(dequant_matrix + 12), r3);
329 psrlq_i2r(16, r4);
330 movq_r2m(r7, *(output_data + 8));
331
332 movq_r2r(r4, r5);
333 movq_r2r(r0, r7);
334 psrlq_i2r(16, r4);
335 psrlq_i2r(48, r7);
336 movq_r2r(r2, r6);
337 pand_r2r(r2, r5);
338 pand_r2r(r4, r6);
339 movq_r2m(r7, *(output_data + 40));
340
341 pxor_r2r(r6, r4);
342 psrlq_i2r(32, r1);
343 por_r2r(r5, r4);
344 movq_m2r(*M(3), r7);
345 pand_r2r(r2, r1);
346 movq_m2r(*(input_data + 24), r5);
347 psllq_i2r(16, r0);
348 pmullw_m2r(*(dequant_matrix + 24), r5);
349 pand_r2r(r0, r7);
350 movq_r2m(r1, *(output_data + 32));
351
352 por_r2r(r4, r7);
353 movq_r2r(r3, r4);
354 pand_r2r(r2, r3);
355 movq_m2r(*M(2), r1);
356 psllq_i2r(32, r3);
357 por_r2r(r3, r7);
358 movq_r2r(r5, r3);
359 psllq_i2r(48, r3);
360 pand_r2r(r0, r1);
361 movq_r2m(r7, *(output_data + 16));
362
363 por_r2r(r3, r6);
364 movq_m2r(*M(1), r7);
365 por_r2r(r1, r6);
366 movq_m2r(*(input_data + 28), r1);
367 pand_r2r(r4, r7);
368 pmullw_m2r(*(dequant_matrix + 28), r1);
369 por_r2r(r6, r7);
370 pand_m2r(*M(1), r0);
371 psrlq_i2r(32, r4);
372 movq_r2m(r7, *(output_data + 24));
373
374 movq_r2r(r4, r6);
375 movq_m2r(*M(3), r7);
376 pand_r2r(r2, r4);
377 movq_m2r(*M(1), r3);
378 pand_r2r(r1, r7);
379 pand_r2r(r5, r3);
380 por_r2r(r4, r0);
381 psllq_i2r(16, r3);
382 por_r2r(r0, r7);
383 movq_m2r(*M(2), r4);
384 por_r2r(r3, r7);
385 movq_m2r(*(input_data + 40), r0);
386 movq_r2r(r4, r3);
387 pmullw_m2r(*(dequant_matrix + 40), r0);
388 pand_r2r(r5, r4);
389 movq_r2m(r7, *(output_data + 4));
390
391 por_r2r(r4, r6);
392 movq_r2r(r3, r4);
393 psrlq_i2r(16, r6);
394 movq_r2r(r0, r7);
395 pand_r2r(r1, r4);
396 psllq_i2r(48, r7);
397 por_r2r(r4, r6);
398 movq_m2r(*(input_data + 44), r4);
399 por_r2r(r6, r7);
400 pmullw_m2r(*(dequant_matrix + 44), r4);
401 psrlq_i2r(16, r3);
402 movq_r2m(r7, *(output_data + 12));
403
404 pand_r2r(r1, r3);
405 psrlq_i2r(48, r5);
406 pand_r2r(r2, r1);
407 movq_m2r(*(input_data + 52), r6);
408 por_r2r(r3, r5);
409 pmullw_m2r(*(input_data + 52), r6);
410 psrlq_i2r(16, r0);
411 movq_r2r(r4, r7);
412 movq_r2r(r2, r3);
413 psllq_i2r(48, r7);
414 pand_r2r(r0, r3);
415 pxor_r2r(r3, r0);
416 psllq_i2r(32, r3);
417 por_r2r(r5, r7);
418 movq_r2r(r6, r5);
419 pand_m2r(*M(1), r6);
420 por_r2r(r3, r7);
421 psllq_i2r(32, r6);
422 por_r2r(r1, r0);
423 movq_r2m(r7, *(output_data + 20));
424
425 por_r2r(r6, r0);
426 movq_m2r(*(input_data + 60), r7);
427 movq_r2r(r5, r6);
428 pmullw_m2r(*(input_data + 60), r7);
429 psrlq_i2r(32, r5);
430 pand_r2r(r2, r6);
431 movq_r2r(r5, r1);
432 movq_r2m(r0, *(output_data + 28));
433
434 pand_r2r(r2, r1);
435 movq_m2r(*(input_data + 56), r0);
436 movq_r2r(r7, r3);
437 pmullw_m2r(*(dequant_matrix + 56), r0);
438 psllq_i2r(16, r3);
439 pand_m2r(*M(3), r7);
440 pxor_r2r(r1, r5);
441 por_r2r(r5, r6);
442 movq_r2r(r3, r5);
443 pand_m2r(*M(3), r5);
444 por_r2r(r1, r7);
445 movq_m2r(*(input_data + 48), r1);
446 pxor_r2r(r5, r3);
447 pmullw_m2r(*(dequant_matrix + 48), r1);
448 por_r2r(r3, r7);
449 por_r2r(r5, r6);
450 movq_r2r(r0, r5);
451 movq_r2m(r7, *(output_data + 60));
452
453 psrlq_i2r(16, r5);
454 pand_m2r(*M(2), r5);
455 movq_r2r(r0, r7);
456 por_r2r(r5, r6);
457 pand_r2r(r2, r0);
458 pxor_r2r(r0, r7);
459 psllq_i2r(32, r0);
460 movq_r2m(r6, *(output_data + 52));
461
462 psrlq_i2r(16, r4);
463 movq_m2r(*(input_data + 36), r5);
464 psllq_i2r(16, r7);
465 pmullw_m2r(*(dequant_matrix + 36), r5);
466 movq_r2r(r7, r6);
467 movq_m2r(*M(2), r3);
468 psllq_i2r(16, r6);
469 pand_m2r(*M(3), r7);
470 pand_r2r(r1, r3);
471 por_r2r(r0, r7);
472 movq_r2r(r1, r0);
473 pand_m2r(*M(3), r1);
474 por_r2r(r3, r6);
475 movq_r2r(r4, r3);
476 psrlq_i2r(32, r1);
477 pand_r2r(r2, r3);
478 por_r2r(r1, r7);
479 por_r2r(r3, r7);
480 movq_r2r(r4, r3);
481 pand_m2r(*M(1), r3);
482 movq_r2r(r5, r1);
483 movq_r2m(r7, *(output_data + 44));
484
485 psrlq_i2r(48, r5);
486 movq_m2r(*(input_data + 32), r7);
487 por_r2r(r3, r6);
488 pmullw_m2r(*(dequant_matrix + 32), r7);
489 por_r2r(r5, r6);
490 pand_m2r(*M(2), r4);
491 psllq_i2r(32, r0);
492 movq_r2m(r6, *(output_data + 36));
493
494 movq_r2r(r0, r6);
495 pand_m2r(*M(3), r0);
496 psllq_i2r(16, r6);
497 movq_m2r(*(input_data + 20), r5);
498 movq_r2r(r1, r3);
499 pmullw_m2r(*(dequant_matrix + 40), r5);
500 psrlq_i2r(16, r1);
501 pand_m2r(*M(1), r1);
502 por_r2r(r4, r0);
503 pand_r2r(r7, r2);
504 por_r2r(r1, r0);
505 por_r2r(r2, r0);
506 psllq_i2r(16, r3);
507 movq_r2r(r3, r4);
508 movq_r2r(r5, r2);
509 movq_r2m(r0, *(output_data + 56));
510
511 psrlq_i2r(48, r2);
512 pand_m2r(*M(2), r4);
513 por_r2r(r2, r6);
514 movq_m2r(*M(1), r2);
515 por_r2r(r4, r6);
516 pand_r2r(r7, r2);
517 psllq_i2r(32, r3);
518 por_m2r(*(output_data + 40), r3);
519
520 por_r2r(r2, r6);
521 movq_m2r(*M(3), r2);
522 psllq_i2r(16, r5);
523 movq_r2m(r6, *(output_data + 48));
524
525 pand_r2r(r5, r2);
526 movq_m2r(*M(2), r6);
527 pxor_r2r(r2, r5);
528 pand_r2r(r7, r6);
529 psrlq_i2r(32, r2);
530 pand_m2r(*M(3), r7);
531 por_r2r(r2, r3);
532 por_m2r(*(output_data + 32), r7);
533
534 por_r2r(r3, r6);
535 por_r2r(r5, r7);
536 movq_r2m(r6, *(output_data + 40));
537 movq_r2m(r7, *(output_data + 32));
538
539
540 #undef M
541
542 /* at this point, function has completed dequantization + dezigzag +
543 * partial transposition; now do the idct itself */
544
545 #define I(K) (output_data + K * 8)
546 #define J(K) (output_data + ((K - 4) * 8) + 4)
547
548 RowIDCT();
549 Transpose();
550
551 #undef I
552 #undef J
553 #define I(K) (output_data + (K * 8) + 32)
554 #define J(K) (output_data + ((K - 4) * 8) + 36)
555
556 RowIDCT();
557 Transpose();
558
559 #undef I
560 #undef J
561 #define I(K) (output_data + K * 8)
562 #define J(K) (output_data + K * 8)
563
564 ColumnIDCT();
565
566 #undef I
567 #undef J
568 #define I(K) (output_data + (K * 8) + 4)
569 #define J(K) (output_data + (K * 8) + 4)
570
571 ColumnIDCT();
572
573 #undef I
574 #undef J
575
576 }
577
578 void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
579 int coeff_count, uint8_t *dest, int stride)
580 {
581 int16_t transformed_data[64];
582 int16_t *op;
583 int i, j;
584 uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
585
586 vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
587
588 /* place in final output */
589 op = transformed_data;
590 movq_m2r(*vector128, mm0);
591 for (i = 0; i < 8; i++) {
592 #if 1
593 for (j = 0; j < 8; j++) {
594 if (*op < -128)
595 *dest = 0;
596 else if (*op > 127)
597 *dest = 255;
598 else
599 *dest = (uint8_t)(*op + 128);
600 op++;
601 dest++;
602 }
603 dest += (stride - 8);
604 #else
605 /* prototype optimization */
606 pxor_r2r(mm1, mm1);
607 packsswb_m2r(*(op + 4), mm1);
608 movq_r2r(mm1, mm2);
609 psrlq_i2r(32, mm2);
610 packsswb_m2r(*(op + 0), mm1);
611 op += 8;
612 por_r2r(mm2, mm1);
613 paddb_r2r(mm0, mm1);
614 movq_r2m(mm1, *dest);
615 dest += stride;
616 #endif
617 }
618
619 /* be a good MMX citizen */
620 emms();
621 }
622
623 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
624 int coeff_count, uint8_t *dest, int stride)
625 {
626 int16_t transformed_data[64];
627 int16_t *op;
628 int i, j;
629 int16_t sample;
630
631 vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
632
633 /* place in final output */
634 op = transformed_data;
635 for (i = 0; i < 8; i++) {
636 for (j = 0; j < 8; j++) {
637 sample = *dest + *op;
638 if (sample < 0)
639 *dest = 0;
640 else if (sample > 255)
641 *dest = 255;
642 else
643 *dest = (uint8_t)(sample & 0xFF);
644 op++;
645 dest++;
646 }
647 dest += (stride - 8);
648 }
649
650 /* be a good MMX citizen */
651 emms();
652 }