Commit | Line | Data |
---|---|---|
de6d9b64 FB |
1 | ; ////////////////////////////////////////////////////////////////////////////// |
2 | ; // | |
3 | ; // fdctam32.c - AP922 MMX(3D-Now) forward-DCT | |
4 | ; // ---------- | |
5 | ; // Intel Application Note AP-922 - fast, precise implementation of DCT | |
6 | ; // http://developer.intel.com/vtune/cbts/appnotes.htm | |
7 | ; // ---------- | |
8 | ; // | |
9 | ; // This routine can use a 3D-Now/MMX enhancement to increase the | |
10 | ; // accuracy of the fdct_col_4 macro. The dct_col function uses 3D-Now's | |
11 | ; // PMULHRW instead of MMX's PMULHW (and POR). The substitution improves | |
12 | ; // accuracy very slightly, at a small performance penalty. If the target CPU | |
13 | ; // does not support 3D-Now, then this function cannot be executed. | |
14 | ; // | |
15 | ; // For a fast, precise MMX implementation of inverse-DCT | |
16 | ; // visit http://www.elecard.com/peter | |
17 | ; // | |
18 | ; // v1.0 07/22/2000 (initial release) | |
19 | ; // | |
20 | ; // liaor@iname.com http://members.tripod.com/~liaor | |
21 | ; ////////////////////////////////////////////////////////////////////////////// | |
22 | ||
23 | ;;; | |
24 | ;;; A.Stevens Jul 2000: ported to nasm syntax and disentangled | |
25 | ;;; from Win**** compiler specific stuff. | |
26 | ;;; All the real work was done above though. | |
27 | ;;; See above for how to optimise quality on 3DNow! CPU's | |
28 | ||
29 | ;; Register aliases and memory-operand shorthands used by fdct_mmx. | |
30 | ;; Macros for code-readability: x0..x7 / y0..y7 address rows 0..7 relative to INP / OUT, 16 bytes apart. | |
31 | ;; TABLE and TABLEF both name ebx: fdct_mmx loads TABLEF for the column pass and reloads ebx as TABLE for the row pass. | |
32 | %define INP eax ; pointer into the input (short *blk); advanced by fdct_mmx as it walks the block | |
33 | %define OUT ecx ; pointer to output; fdct_mmx copies INP into it, so results overwrite blk (original note: temporary store space qwTemp[]) | |
34 | %define TABLE ebx ; pointer to tab_frw_01234567[] (row-pass coefficients) | |
35 | %define TABLEF ebx ; pointer to fdct_tg_all_16 (column-pass constants) | |
36 | %define round_frw_row edx | |
37 | ||
38 | ||
39 | %define x0 INP + 0*16 | |
40 | %define x1 INP + 1*16 | |
41 | %define x2 INP + 2*16 | |
42 | %define x3 INP + 3*16 | |
43 | %define x4 INP + 4*16 | |
44 | %define x5 INP + 5*16 | |
45 | %define x6 INP + 6*16 | |
46 | %define x7 INP + 7*16 | |
47 | %define y0 OUT + 0*16 | |
48 | %define y1 OUT + 1*16 | |
49 | %define y2 OUT + 2*16 | |
50 | %define y3 OUT + 3*16 | |
51 | %define y4 OUT + 4*16 | |
52 | %define y5 OUT + 5*16 | |
53 | %define y6 OUT + 6*16 | |
54 | %define y7 OUT + 7*16 | |
55 | ||
56 | ;; Fixed-point scaling constants for the forward DCT. | |
57 | ;; Constants for DCT: SHIFT_FRW_COL is the left shift applied to column-pass terms, SHIFT_FRW_ROW the right shift applied to row-pass sums. | |
58 | ;; RND_FRW_ROW / RND_FRW_COL are the half-unit values for those shifts; the code visible here does not reference them (it uses fdct_r_row and fdct_one_corr instead). | |
59 | %define BITS_FRW_ACC 3 ; extra bits kept by the column pass: 2 or 3 for accuracy | |
60 | %define SHIFT_FRW_COL BITS_FRW_ACC | |
61 | %define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) | |
62 | %define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) | |
63 | %define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | |
64 | ||
65 | extern fdct_one_corr | |
66 | extern fdct_r_row ; row-pass rounder, added with paddd before the SHIFT_FRW_ROW shift; defined in C for convenience | |
67 | ;; fdct_one_corr (declared above) is OR-ed into several column-pass results as a rounding correction. | |
68 | ;; Concatenated table of forward dct transformation coeffs (the column-pass multipliers), read 8 bytes per entry. | |
69 | ;; | |
70 | extern fdct_tg_all_16 ; defined in C for convenience | |
71 | ;; Byte offsets into that table, relative to TABLEF; cos_4_16 is not referenced by the code visible here. | |
72 | ||
73 | %define tg_1_16 (TABLEF + 0) | |
74 | %define tg_2_16 (TABLEF + 8) | |
75 | %define tg_3_16 (TABLEF + 16) | |
76 | %define cos_4_16 (TABLEF + 24) | |
77 | %define ocos_4_16 (TABLEF + 32) | |
78 | ||
79 | ;; | |
80 | ;; Concatenated table of forward dct row coefficients: fdct_mmx reads 64 bytes of it per row and steps TABLE by 64 after each row. | |
81 | ;; | |
82 | extern tab_frw_01234567 ; defined in C for convenience | |
83 | ||
84 | ;; Code section; fdct_mmx is the exported entry point. | |
85 | SECTION .text | |
86 | ||
87 | global fdct_mmx | |
88 | ||
89 | ;;; In-place 8x8 forward DCT on 16-bit samples using MMX: a column pass over columns 0-3 and 4-7, then a row pass over the 8 rows. | |
90 | ;;; void fdct_mmx( short *blk ) | |
91 | ;;; blk is read from the stack at [ebp+8]; ebx, ecx, edx and edi are saved and restored; eax and mm0-mm7 are overwritten; emms is executed before ret. | |
92 | ||
93 | ||
94 | ||
95 | ; //////////////////////////////////////////////////////////////////////// | |
96 | ; // | |
97 | ; // The high-level pseudocode for this routine (fdct_am32() in the original source, exported here as fdct_mmx) : | |
98 | ; // | |
99 | ; // fdct_am32() | |
100 | ; // { | |
101 | ; // forward_dct_col03(); // dct_column transform on cols 0-3 | |
102 | ; // forward_dct_col47(); // dct_column transform on cols 4-7 | |
103 | ; // for ( j = 0; j < 8; j=j+1 ) | |
104 | ; // forward_dct_row1(j); // dct_row transform on row #j | |
105 | ; // } | |
106 | ; // | |
107 | ; | |
108 | ||
109 | align 32 | |
110 | fdct_mmx: | |
111 | push ebp ; save caller's frame pointer | |
112 | mov ebp, esp ; link: the blk argument is now at [ebp+8] | |
113 | ||
114 | push ebx | |
115 | push ecx | |
116 | push edx | |
117 | push edi | |
118 | ||
119 | mov INP, [ebp+8]; ; INP = blk; input data is row 0 of blk[] | |
120 | ;// transform the left half of the matrix (4 columns) | |
121 | ||
122 | lea TABLEF, [fdct_tg_all_16]; TABLEF -> column-pass constant table | |
123 | mov OUT, INP; results are written over the input block | |
124 | ||
125 | ; lea round_frw_col, [r_frw_col] | |
126 | ; for ( i = 0; i < 2; i = i + 1) | |
127 | ; the for-loop is executed twice. We are better off unrolling the | |
128 | ; loop to avoid branch misprediction. | |
129 | .mmx32_fdct_col03: | |
130 | movq mm0, [x1] ; 0 ; x1 (each movq covers 4 words = 4 columns of one row) | |
131 | ;; NOTE(review): the bare digit in the middle comment column appears to name the mm register loaded or released on that line -- confirm | |
132 | ||
133 | movq mm1, [x6] ; 1 ; x6 | |
134 | movq mm2, mm0 ; 2 ; x1 | |
135 | ||
136 | movq mm3, [x2] ; 3 ; x2 | |
137 | paddsw mm0, mm1 ; t1 = x[1] + x[6] | |
138 | ||
139 | movq mm4, [x5] ; 4 ; x5 | |
140 | psllw mm0, SHIFT_FRW_COL ; t1 <<= SHIFT_FRW_COL | |
141 | ||
142 | movq mm5, [x0] ; 5 ; x0 | |
143 | paddsw mm4, mm3 ; t2 = x[2] + x[5] | |
144 | ||
145 | paddsw mm5, [x7] ; t0 = x[0] + x[7] | |
146 | psllw mm4, SHIFT_FRW_COL ; t2 <<= SHIFT_FRW_COL | |
147 | ||
148 | movq mm6, mm0 ; 6 ; t1 | |
149 | psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] | |
150 | ||
151 | movq mm1, [tg_2_16] ; 1 ; tg_2_16 | |
152 | psubsw mm0, mm4 ; tm12 = t1 - t2 | |
153 | ||
154 | movq mm7, [x3] ; 7 ; x3 | |
155 | pmulhw mm1, mm0 ; tm12*tg_2_16 | |
156 | ||
157 | paddsw mm7, [x4] ; t3 = x[3] + x[4] | |
158 | psllw mm5, SHIFT_FRW_COL ; t0 <<= SHIFT_FRW_COL | |
159 | ||
160 | paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 | |
161 | psllw mm7, SHIFT_FRW_COL ; t3 <<= SHIFT_FRW_COL | |
162 | ||
163 | movq mm4, mm5 ; 4 ; t0 | |
164 | psubsw mm5, mm7 ; tm03 = t0 - t3 | |
165 | ||
166 | paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 | |
167 | paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 | |
168 | ||
169 | por mm1, [fdct_one_corr] ; correction y2 +0.5 (applied by OR-ing in fdct_one_corr) | |
170 | psllw mm2, SHIFT_FRW_COL+1 ; t6, shifted one bit further than the other t terms | |
171 | ||
172 | pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 | |
173 | movq mm7, mm4 ; 7 ; tp03 | |
174 | ||
175 | psubsw mm3, [x5] ; t5 = x[2] - x[5] | |
176 | psubsw mm4, mm6 ; y4 = tp03 - tp12 | |
177 | ||
178 | movq [y2], mm1 ; 1 ; save y2 (row 2 is not read again for these columns) | |
179 | paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 | |
180 | ||
181 | movq mm1, [x3] ; 1 ; x3 | |
182 | psllw mm3, SHIFT_FRW_COL+1 ; t5, shifted one bit further like t6 | |
183 | ||
184 | psubsw mm1, [x4] ; t4 = x[3] - x[4] | |
185 | movq mm6, mm2 ; 6 ; t6 | |
186 | ||
187 | movq [y4], mm4 ; 4 ; save y4 (after the last read of x4 above) | |
188 | paddsw mm2, mm3 ; t6 + t5 | |
189 | ||
190 | pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16, using the ocos_4_16 table entry | |
191 | psubsw mm6, mm3 ; 3 ; t6 - t5 | |
192 | ||
193 | pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16, using the ocos_4_16 table entry | |
194 | psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 | |
195 | ||
196 | por mm5, [fdct_one_corr] ; correction y6 +0.5 | |
197 | psllw mm1, SHIFT_FRW_COL ; t4 <<= SHIFT_FRW_COL | |
198 | ||
199 | por mm2, [fdct_one_corr] ; correction tp65 +0.5 | |
200 | movq mm4, mm1 ; 4 ; t4 | |
201 | ||
202 | movq mm3, [x0] ; 3 ; x0 | |
203 | paddsw mm1, mm6 ; tp465 = t4 + tm65 | |
204 | ||
205 | psubsw mm3, [x7] ; t7 = x[0] - x[7] | |
206 | psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 | |
207 | ||
208 | movq mm0, [tg_1_16] ; 0 ; tg_1_16 | |
209 | psllw mm3, SHIFT_FRW_COL ; t7 <<= SHIFT_FRW_COL | |
210 | ||
211 | movq mm6, [tg_3_16] ; 6 ; tg_3_16 | |
212 | pmulhw mm0, mm1 ; tp465*tg_1_16 | |
213 | ||
214 | movq [y0], mm7 ; 7 ; save y0 (after the last read of x0 above) | |
215 | pmulhw mm6, mm4 ; tm465*tg_3_16 | |
216 | ||
217 | movq [y6], mm5 ; 5 ; save y6 | |
218 | movq mm7, mm3 ; 7 ; t7 | |
219 | ||
220 | movq mm5, [tg_3_16] ; 5 ; tg_3_16 | |
221 | psubsw mm7, mm2 ; tm765 = t7 - tp65 | |
222 | ||
223 | paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 | |
224 | pmulhw mm5, mm7 ; tm765*tg_3_16 | |
225 | ||
226 | paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 | |
227 | paddsw mm6, mm4 ; tm465*tg_3_16: tm465 is added back onto the pmulhw product (NOTE(review): presumably the table entry holds tg_3_16 - 1 -- confirm in the C table) | |
228 | ||
229 | pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 | |
230 | ;; | |
231 | ||
232 | por mm0, [fdct_one_corr] ; correction y1 +0.5 | |
233 | paddsw mm5, mm7 ; tm765*tg_3_16: same add-back of the multiplicand as for tm465 | |
234 | ||
235 | psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 | |
236 | add INP, 0x08 ; ; advance INP by 8 bytes (4 words): x0..x7 now address columns 4-7, OUT is unchanged | |
237 | ||
238 | movq [y1], mm0 ; 0 ; save y1 | |
239 | paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 | |
240 | ||
241 | movq [y3], mm7 ; 7 ; save y3 | |
242 | psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 (stored at the start of the col47 section) | |
243 | ||
244 | movq [y5], mm5 ; 5 ; save y5 | |
245 | ||
246 | ||
247 | .mmx32_fdct_col47: ; begin processing last four columns (INP was advanced by 8 bytes; OUT still points at column 0, hence the +8 on every store) | |
248 | movq mm0, [x1] ; 0 ; x1 | |
249 | ;; mm3 still holds y7 of the previous four columns at this point | |
250 | movq [y7], mm3 ; 3 ; save y7 (columns 0-3) | |
251 | ;; | |
252 | ||
253 | movq mm1, [x6] ; 1 ; x6 | |
254 | movq mm2, mm0 ; 2 ; x1 | |
255 | ||
256 | movq mm3, [x2] ; 3 ; x2 | |
257 | paddsw mm0, mm1 ; t1 = x[1] + x[6] | |
258 | ||
259 | movq mm4, [x5] ; 4 ; x5 | |
260 | psllw mm0, SHIFT_FRW_COL ; t1 <<= SHIFT_FRW_COL | |
261 | ||
262 | movq mm5, [x0] ; 5 ; x0 | |
263 | paddsw mm4, mm3 ; t2 = x[2] + x[5] | |
264 | ||
265 | paddsw mm5, [x7] ; t0 = x[0] + x[7] | |
266 | psllw mm4, SHIFT_FRW_COL ; t2 <<= SHIFT_FRW_COL | |
267 | ||
268 | movq mm6, mm0 ; 6 ; t1 | |
269 | psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] | |
270 | ||
271 | movq mm1, [tg_2_16] ; 1 ; tg_2_16 | |
272 | psubsw mm0, mm4 ; tm12 = t1 - t2 | |
273 | ||
274 | movq mm7, [x3] ; 7 ; x3 | |
275 | pmulhw mm1, mm0 ; tm12*tg_2_16 | |
276 | ||
277 | paddsw mm7, [x4] ; t3 = x[3] + x[4] | |
278 | psllw mm5, SHIFT_FRW_COL ; t0 <<= SHIFT_FRW_COL | |
279 | ||
280 | paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 | |
281 | psllw mm7, SHIFT_FRW_COL ; t3 <<= SHIFT_FRW_COL | |
282 | ||
283 | movq mm4, mm5 ; 4 ; t0 | |
284 | psubsw mm5, mm7 ; tm03 = t0 - t3 | |
285 | ||
286 | paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 | |
287 | paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 | |
288 | ||
289 | por mm1, [fdct_one_corr] ; correction y2 +0.5 (applied by OR-ing in fdct_one_corr) | |
290 | psllw mm2, SHIFT_FRW_COL+1 ; t6, shifted one bit further than the other t terms | |
291 | ||
292 | pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 | |
293 | movq mm7, mm4 ; 7 ; tp03 | |
294 | ||
295 | psubsw mm3, [x5] ; t5 = x[2] - x[5] | |
296 | psubsw mm4, mm6 ; y4 = tp03 - tp12 | |
297 | ||
298 | movq [y2+8], mm1 ; 1 ; save y2 (columns 4-7) | |
299 | paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 | |
300 | ||
301 | movq mm1, [x3] ; 1 ; x3 | |
302 | psllw mm3, SHIFT_FRW_COL+1 ; t5, shifted one bit further like t6 | |
303 | ||
304 | psubsw mm1, [x4] ; t4 = x[3] - x[4] | |
305 | movq mm6, mm2 ; 6 ; t6 | |
306 | ||
307 | movq [y4+8], mm4 ; 4 ; save y4 (after the last read of x4 above) | |
308 | paddsw mm2, mm3 ; t6 + t5 | |
309 | ||
310 | pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16, using the ocos_4_16 table entry | |
311 | psubsw mm6, mm3 ; 3 ; t6 - t5 | |
312 | ||
313 | pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16, using the ocos_4_16 table entry | |
314 | psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 | |
315 | ||
316 | por mm5, [fdct_one_corr] ; correction y6 +0.5 | |
317 | psllw mm1, SHIFT_FRW_COL ; t4 <<= SHIFT_FRW_COL | |
318 | ||
319 | por mm2, [fdct_one_corr] ; correction tp65 +0.5 | |
320 | movq mm4, mm1 ; 4 ; t4 | |
321 | ||
322 | movq mm3, [x0] ; 3 ; x0 | |
323 | paddsw mm1, mm6 ; tp465 = t4 + tm65 | |
324 | ||
325 | psubsw mm3, [x7] ; t7 = x[0] - x[7] | |
326 | psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 | |
327 | ||
328 | movq mm0, [tg_1_16] ; 0 ; tg_1_16 | |
329 | psllw mm3, SHIFT_FRW_COL ; t7 <<= SHIFT_FRW_COL | |
330 | ||
331 | movq mm6, [tg_3_16] ; 6 ; tg_3_16 | |
332 | pmulhw mm0, mm1 ; tp465*tg_1_16 | |
333 | ||
334 | movq [y0+8], mm7 ; 7 ; save y0 (after the last read of x0 above) | |
335 | pmulhw mm6, mm4 ; tm465*tg_3_16 | |
336 | ||
337 | movq [y6+8], mm5 ; 5 ; save y6 | |
338 | movq mm7, mm3 ; 7 ; t7 | |
339 | ||
340 | movq mm5, [tg_3_16] ; 5 ; tg_3_16 | |
341 | psubsw mm7, mm2 ; tm765 = t7 - tp65 | |
342 | ||
343 | paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 | |
344 | pmulhw mm5, mm7 ; tm765*tg_3_16 | |
345 | ||
346 | paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 | |
347 | paddsw mm6, mm4 ; tm465*tg_3_16: tm465 is added back onto the pmulhw product (NOTE(review): presumably the table entry holds tg_3_16 - 1 -- confirm in the C table) | |
348 | ||
349 | pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 | |
350 | ;; | |
351 | ||
352 | por mm0, [fdct_one_corr] ; correction y1 +0.5 | |
353 | paddsw mm5, mm7 ; tm765*tg_3_16: same add-back of the multiplicand as for tm465 | |
354 | ||
355 | psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 | |
356 | ;; (no pointer increment here: this is the last column group) | |
357 | ||
358 | movq [y1+8], mm0 ; 0 ; save y1 | |
359 | paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 | |
360 | ||
361 | movq [y3+8], mm7 ; 7 ; save y3 | |
362 | psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 | |
363 | ||
364 | movq [y5+8], mm5 ; 5 ; save y5 | |
365 | ||
366 | movq [y7+8], mm3 ; 3 ; save y7 (columns 4-7) | |
367 | ||
368 | ; emms; | |
369 | ; } ; end of forward_dct_col07() | |
370 | ; done with dct_col transform (both 4-column halves) | |
371 | ||
372 | ||
373 | ; Row pass (this comment block is headed fdct_mmx32_cols() in the original source) -- | |
374 | ; the following code applies the 1-D transform to each of the 8 rows, | |
375 | ; with its own shift&round constants (SHIFT_FRW_ROW, fdct_r_row). This version | |
376 | ; does NOT transpose the output again. NOTE(review): the original remark says the final output | |
377 | ; is transposed with respect to the source; confirm, since each row below is stored back to the row it was loaded from. | |
378 | ; | |
379 | ; The output is stored into blk[], which destroys the original | |
380 | ; input data. | |
381 | mov INP, [ebp+8]; ;; INP = blk again (row 0) | |
382 | mov edi, 0x08; ;x = 8 (rows left to transform) | |
383 | ||
384 | lea TABLE, [tab_frw_01234567]; ; coefficient table for row 0 | |
385 | mov OUT, INP; each row is written back over itself | |
386 | ||
387 | lea round_frw_row, [fdct_r_row]; edx -> rounder added before the SHIFT_FRW_ROW shift | |
388 | ; for ( x = 8; x > 0; --x ) ; transform one row per iteration | |
389 | ||
390 | ; ---------- loop begin | |
391 | .lp_mmx_fdct_row1: | |
392 | movd mm5, [INP+12]; ; mm5 = _ _ 7 6 (word indices of the row, listed high to low) | |
393 | ||
394 | punpcklwd mm5, [INP+8] ; mm5 = 5 7 4 6 | |
395 | ||
396 | movq mm2, mm5; ; mm2 = 5 7 4 6 | |
397 | psrlq mm5, 32; ; mm5 = _ _ 5 7 | |
398 | ||
399 | movq mm0, [INP]; ; mm0 = 3 2 1 0 | |
400 | punpcklwd mm5, mm2;; mm5 = 4 5 6 7 (second half of the row, reversed) | |
401 | ||
402 | movq mm1, mm0; ; mm1 = 3 2 1 0 | |
403 | paddsw mm0, mm5; ; mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0) | |
404 | ||
405 | psubsw mm1, mm5; ; mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4) | |
406 | movq mm2, mm0; ; mm2 = [ xt3 xt2 xt1 xt0 ] | |
407 | ||
408 | ;movq [ xt3xt2xt1xt0 ], mm0; | |
409 | ;movq [ xt7xt6xt5xt4 ], mm1; | |
410 | ||
411 | punpcklwd mm0, mm1;; mm0 = [ xt5 xt1 xt4 xt0 ] | |
412 | ||
413 | punpckhwd mm2, mm1;; mm2 = [ xt7 xt3 xt6 xt2 ] | |
414 | movq mm1, mm2; ; mm1 = [ xt7 xt3 xt6 xt2 ] | |
415 | ||
416 | ;; shuffle words around: regroup into sum pairs (xt2 xt0, xt3 xt1) and difference pairs (xt6 xt4, xt7 xt5); the x-names kept in the pmaddwd comments below map as x0=xt0 x4=xt2 x2=xt1 x6=xt3 x1=xt4 x5=xt6 x3=xt5 x7=xt7 | |
417 | ||
418 | ; movq mm0, [INP] ; 0 ; x3 x2 x1 x0 | |
419 | ||
420 | ; movq mm1, [INP+8] ; 1 ; x7 x6 x5 x4 | |
421 | movq mm2, mm0 ; 2 ; [ xt5 xt1 xt4 xt0 ] (label in the original source: x3 x2 x1 x0) | |
422 | ||
423 | movq mm3, [TABLE] ; 3 ; w06 w04 w02 w00 | |
424 | punpcklwd mm0, mm1 ; [ xt6 xt4 xt2 xt0 ] (label in the original source: x5 x1 x4 x0) | |
425 | ||
426 | movq mm5, mm0 ; 5 ; [ xt6 xt4 xt2 xt0 ] (label in the original source: x5 x1 x4 x0) | |
427 | punpckldq mm0, mm0 ; x4 x0 x4 x0 [ xt2 xt0 xt2 xt0 ] | |
428 | ||
429 | movq mm4, [TABLE+8] ; 4 ; w07 w05 w03 w01 | |
430 | punpckhwd mm2, mm1 ; 1 ; [ xt7 xt5 xt3 xt1 ] (label in the original source: x7 x3 x6 x2) | |
431 | ||
432 | pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 | |
433 | movq mm6, mm2 ; 6 ; [ xt7 xt5 xt3 xt1 ] (label in the original source: x7 x3 x6 x2) | |
434 | ||
435 | movq mm1, [TABLE+32] ; 1 ; w22 w20 w18 w16 | |
436 | punpckldq mm2, mm2 ; x6 x2 x6 x2 [ xt3 xt1 xt3 xt1 ] | |
437 | ||
438 | pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 | |
439 | punpckhdq mm5, mm5 ; x5 x1 x5 x1 [ xt6 xt4 xt6 xt4 ] | |
440 | ||
441 | pmaddwd mm0, [TABLE+16] ; x4*w14+x0*w12 x4*w10+x0*w08 | |
442 | punpckhdq mm6, mm6 ; x7 x3 x7 x3 [ xt7 xt5 xt7 xt5 ] | |
443 | ||
444 | movq mm7, [TABLE+40] ; 7 ; w23 w21 w19 w17 | |
445 | pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 | |
446 | ;mm3 = a1, a0 (y2,y0) -- register roles from here on; each holds two 32-bit sums | |
447 | ;mm1 = b1, b0 (y3,y1) | |
448 | ;mm0 = a3,a2 (y6,y4) | |
449 | ;mm5 = b3,b2 (y7,y5) | |
450 | ||
451 | paddd mm3, [round_frw_row] ; +rounder (y2,y0) | |
452 | pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 | |
453 | ||
454 | pmaddwd mm2, [TABLE+24] ; x6*w15+x2*w13 x6*w11+x2*w09 | |
455 | paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0) | |
456 | ||
457 | pmaddwd mm5, [TABLE+48] ; x5*w30+x1*w28 x5*w26+x1*w24 | |
458 | ;; | |
459 | ||
460 | pmaddwd mm6, [TABLE+56] ; x7*w31+x3*w29 x7*w27+x3*w25 | |
461 | paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1) | |
462 | ||
463 | paddd mm0, [round_frw_row] ; +rounder (y6,y4) | |
464 | psrad mm3, SHIFT_FRW_ROW ; (y2, y0) >> SHIFT_FRW_ROW | |
465 | ||
466 | paddd mm1, [round_frw_row] ; +rounder (y3,y1) | |
467 | paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4) | |
468 | ||
469 | paddd mm5, [round_frw_row] ; +rounder (y7,y5) | |
470 | psrad mm1, SHIFT_FRW_ROW ; (y3, y1) >> SHIFT_FRW_ROW | |
471 | ||
472 | paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5) | |
473 | psrad mm0, SHIFT_FRW_ROW ;(y6, y4) >> SHIFT_FRW_ROW | |
474 | ||
475 | add OUT, 16; ; increment row-output address by 1 row (the stores below therefore use OUT-16 / OUT-8) | |
476 | psrad mm5, SHIFT_FRW_ROW ; (y7, y5) >> SHIFT_FRW_ROW | |
477 | ||
478 | add INP, 16; ; increment row-address by 1 row | |
479 | packssdw mm3, mm0 ; 0 ; y6 y4 y2 y0 (saturating pack to words) | |
480 | ||
481 | packssdw mm1, mm5 ; 3 ; y7 y5 y3 y1 | |
482 | movq mm6, mm3; ; mm6 = y6 y4 y2 y0 | |
483 | ||
484 | punpcklwd mm3, mm1; ; y3 y2 y1 y0 | |
485 | sub edi, 0x01; ; x = x - 1 (row counter) | |
486 | ||
487 | punpckhwd mm6, mm1; ; y7 y6 y5 y4 | |
488 | add TABLE,64; ; increment to next table (64 bytes of coefficients per row) | |
489 | ||
490 | movq [OUT-16], mm3 ; 1 ; save y3 y2 y1 y0 of the row just transformed | |
491 | ||
492 | movq [OUT-8], mm6 ; 7 ; save y7 y6 y5 y4 of the row just transformed | |
493 | ||
494 | cmp edi, 0x00; rows left? (the add TABLE,64 above overwrote the flags from the sub) | |
495 | jg near .lp_mmx_fdct_row1; ; begin fdct processing on next row | |
496 | ;; | |
497 | ;; Tidy up and return: pop the saved registers in reverse push order, then emms to clear the MMX state before ret | |
498 | ;; | |
499 | pop edi | |
500 | pop edx | |
501 | pop ecx | |
502 | pop ebx | |
503 | ||
504 | pop ebp ; restore caller's frame pointer | |
505 | emms | |
506 | ret | |
507 |