suppressed no longer needed emms()
[libav.git] / libavcodec / i386 / sad_mmx.s
1 ; MMX/SSE optimized routines for SAD of 16*16 macroblocks
2 ; Copyright (C) Juan J. Sierralta P. <juanjo@atmlab.utfsm.cl>
3 ;
4 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
5 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
6
7 ;
8 ; This program is free software; you can redistribute it and/or
9 ; modify it under the terms of the GNU General Public License
10 ; as published by the Free Software Foundation; either version 2
11 ; of the License, or (at your option) any later version.
12 ;
13 ; This program is distributed in the hope that it will be useful,
14 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ; GNU General Public License for more details.
17 ;
18 ; You should have received a copy of the GNU General Public License
19 ; along with this program; if not, write to the Free Software
20 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ;
22
23 global pix_abs16x16_mmx
24 ; Sum of absolute differences (SAD) of two 16-byte-wide pixel blocks, plain MMX.
25 ; int pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
26 ; esi = p1 (init: pix1)
27 ; edi = p2 (init: pix2)
28 ; ecx = rowsleft (init: h); drops by 2 per pass and the loop ends only at exactly 0
29 ; edx = lx; added to both pointers to step to the next row
30 ; eax = result: SAD masked to 16 bits. emms is not executed before returning.
31 ; mm0 = distance accumulators (4 words), fed by the first row of each pair
32 ; mm1 = distance accumulators (4 words), fed by the second row of each pair
33 ; mm2 = temp
34 ; mm3 = temp
35 ; mm4 = temp
36 ; mm5 = temp
37 ; mm6 = temp while differencing the second row, then 0 for byte->word unpacking
38 ; mm7 = temp
39
40
41 align 32
42 pix_abs16x16_mmx:
43 push ebp ; save caller's frame pointer
44 mov ebp, esp ; arguments are now at [ebp+8], [ebp+12], ...
45
46 push ebx ; Save every integer register pushed here so it can be
47 push ecx ; restored on exit (ebx is saved but not otherwise
48 push edx ; used in this routine)
49 push esi
50 push edi
51
52 pxor mm0, mm0 ; zero accumulators
53 pxor mm1, mm1
54 pxor mm6, mm6
55 mov esi, [ebp+8] ; get pix1
56 mov edi, [ebp+12] ; get pix2
57 mov edx, [ebp+16] ; get lx
58 mov ecx, [ebp+20] ; get rowsleft
59 jmp .nextrow ; jump over the alignment padding that follows
60 align 32
61
62 .nextrow:
63 ; First 8 bytes of the row
64
65 movq mm4, [edi] ; load first 8 bytes of pix2 row
66 movq mm5, [esi] ; load first 8 bytes of pix1 row
67 movq mm3, mm4 ; keep the pix2 bytes; mm4 := abs(mm4-mm5) is built below
68 movq mm2,[esi+8] ; load last 8 bytes of pix1 row
69 psubusb mm4, mm5 ; pix2-pix1, saturating to 0 where pix1 is larger
70 movq mm7,[edi+8] ; load last 8 bytes of pix2 row
71 psubusb mm5, mm3 ; pix1-pix2, saturating to 0 where pix2 is larger
72 por mm4, mm5 ; per byte one side is 0, so OR yields the absolute difference
73
74 ; Last 8 bytes of the row
75
76 movq mm3, mm7 ; mm7 := abs(mm7-mm2)
77 psubusb mm7, mm2
78 psubusb mm2, mm3
79 por mm7, mm2
80
81 ; Now mm4 and mm7 have 16 absdiffs to add
82
83 ; First 8 bytes of the row2
84
85
86 add edi, edx ; step pix2 to the second row of this pair
87 movq mm2, [edi] ; load first 8 bytes of pix2 row
88 add esi, edx ; step pix1 to the second row of this pair
89 movq mm5, [esi] ; load first 8 bytes of pix1 row
90
91
92
93 movq mm3, mm2 ; mm2 := abs(mm2-mm5)
94 psubusb mm2, mm5
95 movq mm6,[esi+8] ; load last 8 bytes of pix1 row (mm6 is re-zeroed further down)
96 psubusb mm5, mm3
97 por mm2, mm5
98
99 ; Last 8 bytes of the row2
100
101 movq mm5,[edi+8] ; load last 8 bytes of pix2 row
102
103
104 movq mm3, mm5 ; mm5 := abs(mm5-mm6)
105 psubusb mm5, mm6
106 psubusb mm6, mm3
107 por mm5, mm6
108
109 ; Now mm2, mm4, mm5, mm7 have 32 absdiffs
110
111 movq mm3, mm7
112
113 pxor mm6, mm6 ; Zero mm6 so unpacking zero-extends bytes to words
114
115 punpcklbw mm3, mm6 ; Unpack to words and add
116 punpckhbw mm7, mm6
117 paddusw mm7, mm3
118
119 movq mm3, mm5
120
121 punpcklbw mm3, mm6 ; Unpack to words and add
122 punpckhbw mm5, mm6
123 paddusw mm5, mm3
124
125 paddusw mm0, mm7 ; Add to the accumulator (mm0)
126 paddusw mm1, mm5 ; Add to the accumulator (mm1)
127
128 movq mm3, mm4
129
130 punpcklbw mm3, mm6 ; Unpack to words and add
131 punpckhbw mm4, mm6
132 movq mm5, mm2
133 paddusw mm4, mm3
134
135
136
137 punpcklbw mm5, mm6 ; Unpack to words and add
138 punpckhbw mm2, mm6
139 paddusw mm2, mm5
140
141 ; Loop termination
142
143 add esi, edx ; update pointers to next row
144 paddusw mm0, mm4 ; Add to the accumulator (mm0)
145 add edi, edx
146 sub ecx,2 ; two rows were consumed in this pass
147 paddusw mm1, mm2 ; Add to the accumulator (mm1)
148 test ecx, ecx ; check rowsleft
149 jnz near .nextrow ; repeat until rowsleft is exactly 0
150
151 paddusw mm0, mm1 ; merge both accumulators (saturating add)
152 movq mm2, mm0 ; Copy mm0 to mm2
153 psrlq mm2, 32 ; bring words 2-3 down onto words 0-1
154 paddusw mm0, mm2 ; Add
155 movq mm3, mm0
156 psrlq mm3, 16 ; bring word 1 down onto word 0
157 paddusw mm0, mm3 ; word 0 now holds the sum of all four words
158 movd eax, mm0 ; Store return value
159 and eax, 0xffff ; keep word 0 only; bits 16-31 hold a partial sum
160
161 pop edi
162 pop esi
163 pop edx
164 pop ecx
165 pop ebx
166
167 pop ebp ; restore caller's frame pointer
168
169 ;emms ; disabled: the MMX state is not cleared by this routine
170 ret ; return
171
172 global pix_abs16x16_sse
173 ; Sum of absolute differences (SAD) of two 16-byte-wide pixel blocks using psadbw.
174 ; int pix_abs16x16_sse(unsigned char *pix1,unsigned char *pix2, int lx, int h);
175 ; esi = p1 (init: pix1)
176 ; edi = p2 (init: pix2)
177 ; ecx = rowsleft (init: h); drops by 4 per pass and the loop ends only at exactly 0
178 ; edx = lx; added to both pointers to step to the next row
179 ; eax = result: low 32 bits of the summed accumulators. emms is not executed.
180 ; mm0 = distance accumulator for the first 8 bytes of every row
181 ; mm1 = distance accumulator for the last 8 bytes of every row
182 ; mm2 = unused
183 ; mm3 = unused
184 ; mm4 = temp
185 ; mm5 = temp
186 ; mm6 = temp
187 ; mm7 = temp
188
189
190 align 32
191 pix_abs16x16_sse:
192 push ebp ; save caller's frame pointer
193 mov ebp, esp ; arguments are now at [ebp+8], [ebp+12], ...
194
195 push ebx ; Save every integer register pushed here so it can be
196 push ecx ; restored on exit (ebx is saved but not otherwise
197 push edx ; used in this routine)
198 push esi
199 push edi
200
201 pxor mm0, mm0 ; zero accumulators
202 pxor mm1, mm1
203 mov esi, [ebp+8] ; get pix1
204 mov edi, [ebp+12] ; get pix2
205 mov edx, [ebp+16] ; get lx
206 mov ecx, [ebp+20] ; get rowsleft
207 jmp .next4row ; jump over the alignment padding that follows
208 align 32
209
210 .next4row:
211 ; First row
212
213 movq mm4, [edi] ; load first 8 bytes of pix2 row
214 movq mm5, [edi+8] ; load last 8 bytes of pix2 row
215 psadbw mm4, [esi] ; SAD of first 8 bytes against pix1
216 psadbw mm5, [esi+8] ; SAD of last 8 bytes against pix1
217 paddw mm0, mm4 ; Add to accumulators
218 paddw mm1, mm5
219
220 ; Second row
221
222 add edi, edx; step pix2 to the next row
223 add esi, edx; step pix1 to the next row
224
225 movq mm6, [edi] ; load first 8 bytes of pix2 row
226 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
227 psadbw mm6, [esi] ; SAD of first 8 bytes
228 psadbw mm7, [esi+8] ; SAD of last 8 bytes
229 paddw mm0, mm6 ; Add to accumulators
230 paddw mm1, mm7
231
232 ; Third row
233
234 add edi, edx; step pix2 to the next row
235 add esi, edx; step pix1 to the next row
236
237 movq mm4, [edi] ; load first 8 bytes of pix2 row
238 movq mm5, [edi+8] ; load last 8 bytes of pix2 row
239 psadbw mm4, [esi] ; SAD of first 8 bytes
240 psadbw mm5, [esi+8] ; SAD of last 8 bytes
241 paddw mm0, mm4 ; Add to accumulators
242 paddw mm1, mm5
243
244 ; Fourth row
245
246 add edi, edx; step pix2 to the next row
247 add esi, edx; step pix1 to the next row
248
249 movq mm6, [edi] ; load first 8 bytes of pix2 row
250 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
251 psadbw mm6, [esi] ; SAD of first 8 bytes
252 psadbw mm7, [esi+8] ; SAD of last 8 bytes
253 paddw mm0, mm6 ; Add to accumulators
254 paddw mm1, mm7
255
256 ; Loop termination
257
258 add esi, edx ; update pointers to next row
259 add edi, edx
260 sub ecx,4 ; four rows were consumed in this pass
261 test ecx, ecx ; check rowsleft
262 jnz near .next4row ; repeat until rowsleft is exactly 0
263
264 paddd mm0, mm1 ; Sum accumulators
265 movd eax, mm0 ; Store return value (low dword, not masked)
266
267 pop edi
268 pop esi
269 pop edx
270 pop ecx
271 pop ebx
272
273 pop ebp ; restore caller's frame pointer
274
275 ;emms ; disabled: the MMX state is not cleared by this routine
276 ret ; return
277
278 global pix_abs16x16_x2_mmx
279 ; SAD of pix1 against pix2 averaged with its right-hand neighbour (horizontal half-pel).
280 ; int pix_abs16x16_x2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
281 ; esi = p1 (init: pix1)
282 ; edi = p2 (init: pix2); bytes 0..16 of each pix2 row are read
283 ; ecx = rowsleft (init: h); drops by 1 per pass and the loop ends only at exactly 0
284 ; edx = lx; added to both pointers to step to the next row
285 ; eax = result: SAD masked to 16 bits. emms is executed before returning.
286 ; mm0 = distance accumulators (4 words), low 4 bytes of each 8-byte half
287 ; mm1 = distance accumulators (4 words), high 4 bytes of each 8-byte half
288 ; mm2 = temp
289 ; mm3 = temp
290 ; mm4 = temp
291 ; mm5 = temp
292 ; mm6 = 0
293 ; mm7 = temp
294
295
296 align 32
297 pix_abs16x16_x2_mmx:
298 push ebp ; save caller's frame pointer
299 mov ebp, esp ; arguments are now at [ebp+8], [ebp+12], ...
300
301 push ebx ; Save every integer register pushed here so it can be
302 push ecx ; restored on exit (ebx is saved but not otherwise
303 push edx ; used in this routine)
304 push esi
305 push edi
306
307 pxor mm0, mm0 ; zero accumulators
308 pxor mm1, mm1
309 pxor mm6, mm6 ; mm6 stays 0 for the whole routine
310 mov esi, [ebp+8] ; get pix1
311 mov edi, [ebp+12] ; get pix2
312 mov edx, [ebp+16] ; get lx
313 mov ecx, [ebp+20] ; get rowsleft
314 jmp .nextrow_x2 ; jump over the alignment padding that follows
315 align 32
316
317 .nextrow_x2:
318 ; First 8 bytes of the row
319
320 movq mm4, [edi] ; load first 8 bytes of pix2 row
321 movq mm5, [edi+1] ; load bytes 1-8 of pix2 row
322
323 movq mm2, mm4 ; copy mm4 on mm2
324 movq mm3, mm5 ; copy mm5 on mm3
325 punpcklbw mm4, mm6 ; first 4 bytes of [edi] on mm4
326 punpcklbw mm5, mm6 ; first 4 bytes of [edi+1] on mm5
327 paddusw mm4, mm5 ; mm4 := first 4 bytes interpolated in words
328 psrlw mm4, 1 ; (a+b)>>1, no rounding term is added
329
330 punpckhbw mm2, mm6 ; last 4 bytes of [edi] on mm2
331 punpckhbw mm3, mm6 ; last 4 bytes of [edi+1] on mm3
332 paddusw mm2, mm3 ; mm2 := last 4 bytes interpolated in words
333 psrlw mm2, 1
334
335 packuswb mm4, mm2 ; pack 8 bytes interpolated on mm4
336 movq mm5,[esi] ; load first 8 bytes of pix1 row
337
338 movq mm3, mm4 ; mm4 := abs(mm4-mm5)
339 psubusb mm4, mm5 ; each saturating subtract is 0 on the smaller side,
340 psubusb mm5, mm3 ; so OR-ing both directions yields the absolute
341 por mm4, mm5 ; difference per byte
342
343 ; Last 8 bytes of the row
344
345 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
346 movq mm5, [edi+9] ; load bytes 9-16 of pix2 row
347
348 movq mm2, mm7 ; copy mm7 on mm2
349 movq mm3, mm5 ; copy mm5 on mm3
350 punpcklbw mm7, mm6 ; first 4 bytes of [edi+8] on mm7
351 punpcklbw mm5, mm6 ; first 4 bytes of [edi+9] on mm5
352 paddusw mm7, mm5 ; mm7 := first 4 bytes interpolated in words
353 psrlw mm7, 1
354
355 punpckhbw mm2, mm6 ; last 4 bytes of [edi+8] on mm2
356 punpckhbw mm3, mm6 ; last 4 bytes of [edi+9] on mm3
357 paddusw mm2, mm3 ; mm2 := last 4 bytes interpolated in words
358 psrlw mm2, 1
359
360 packuswb mm7, mm2 ; pack 8 bytes interpolated on mm7
361 movq mm5,[esi+8] ; load last 8 bytes of pix1 row
362
363 movq mm3, mm7 ; mm7 := abs(mm7-mm5)
364 psubusb mm7, mm5
365 psubusb mm5, mm3
366 por mm7, mm5
367
368 ; Now mm4 and mm7 have 16 absdiffs to add
369
370 movq mm3, mm4 ; Make copies of these bytes
371 movq mm2, mm7
372
373 punpcklbw mm4, mm6 ; Unpack to words and add
374 punpcklbw mm7, mm6
375 paddusw mm4, mm7
376 paddusw mm0, mm4 ; Add to the accumulator (mm0)
377
378 punpckhbw mm3, mm6 ; Unpack to words and add
379 punpckhbw mm2, mm6
380 paddusw mm3, mm2
381 paddusw mm1, mm3 ; Add to the accumulator (mm1)
382
383 ; Loop termination
384
385 add esi, edx ; update pointers to next row
386 add edi, edx
387
388 sub ecx,1 ; one row was consumed in this pass
389 test ecx, ecx ; check rowsleft
390 jnz near .nextrow_x2 ; repeat until rowsleft is exactly 0
391
392 paddusw mm0, mm1 ; merge both accumulators (saturating add)
393
394 movq mm1, mm0 ; Copy mm0 to mm1
395 psrlq mm1, 32 ; bring words 2-3 down onto words 0-1
396 paddusw mm0, mm1 ; Add
397 movq mm2, mm0
398 psrlq mm2, 16 ; bring word 1 down onto word 0
399 paddusw mm0, mm2 ; word 0 now holds the sum of all four words
400 movd eax, mm0 ; Store return value
401 and eax, 0xffff ; keep word 0 only; bits 16-31 hold a partial sum
402
403 pop edi
404 pop esi
405 pop edx
406 pop ecx
407 pop ebx
408
409 pop ebp ; restore caller's frame pointer
410
411 emms ; clear the MMX state before returning
412 ret ; return
413
414 global pix_abs16x16_y2_mmx
415 ; SAD of pix1 against pix2 averaged with the pix2 row below it (vertical half-pel).
416 ; int pix_abs16x16_y2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
417 ; esi = p1 (init: pix1)
418 ; edi = p2 (init: pix2)
419 ; ebx = p2 + lx, the pix2 row below the current one (so h+1 rows of pix2 are read)
420 ; ecx = rowsleft (init: h); drops by 1 per pass and the loop ends only at exactly 0
421 ; edx = lx; added to all three pointers to step to the next row
422 ; eax = result: SAD masked to 16 bits. emms is executed before returning.
423 ; mm0 = distance accumulators (4 words), low 4 bytes of each 8-byte half
424 ; mm1 = distance accumulators (4 words), high 4 bytes of each 8-byte half
425 ; mm2 = temp
426 ; mm3 = temp
427 ; mm4 = temp
428 ; mm5 = temp
429 ; mm6 = 0
430 ; mm7 = temp
431
432
433 align 32
434 pix_abs16x16_y2_mmx:
435 push ebp ; save caller's frame pointer
436 mov ebp, esp ; arguments are now at [ebp+8], [ebp+12], ...
437
438 push ebx ; Save every integer register pushed here so it can be
439 push ecx ; restored on exit
440 push edx ;
441 push esi
442 push edi
443
444 pxor mm0, mm0 ; zero accumulators
445 pxor mm1, mm1
446 pxor mm6, mm6 ; mm6 stays 0 for the whole routine
447 mov esi, [ebp+8] ; get pix1
448 mov edi, [ebp+12] ; get pix2
449 mov edx, [ebp+16] ; get lx
450 mov ecx, [ebp+20] ; get rowsleft
451 mov ebx, edi ; ebx := pix2 + lx (row below)
452 add ebx, edx
453 jmp .nextrow_y2 ; jump over the alignment padding that follows
454 align 32
455
456 .nextrow_y2:
457 ; First 8 bytes of the row
458
459 movq mm4, [edi] ; load first 8 bytes of pix2 row
460 movq mm5, [ebx] ; load first 8 bytes of the pix2 row below
461
462 movq mm2, mm4 ; copy mm4 on mm2
463 movq mm3, mm5 ; copy mm5 on mm3
464 punpcklbw mm4, mm6 ; first 4 bytes of [edi] on mm4
465 punpcklbw mm5, mm6 ; first 4 bytes of [ebx] on mm5
466 paddusw mm4, mm5 ; mm4 := first 4 bytes interpolated in words
467 psrlw mm4, 1 ; (a+b)>>1, no rounding term is added
468
469 punpckhbw mm2, mm6 ; last 4 bytes of [edi] on mm2
470 punpckhbw mm3, mm6 ; last 4 bytes of [ebx] on mm3
471 paddusw mm2, mm3 ; mm2 := last 4 bytes interpolated in words
472 psrlw mm2, 1
473
474 packuswb mm4, mm2 ; pack 8 bytes interpolated on mm4
475 movq mm5,[esi] ; load first 8 bytes of pix1 row
476
477 movq mm3, mm4 ; mm4 := abs(mm4-mm5)
478 psubusb mm4, mm5 ; each saturating subtract is 0 on the smaller side,
479 psubusb mm5, mm3 ; so OR-ing both directions yields the absolute
480 por mm4, mm5 ; difference per byte
481
482 ; Last 8 bytes of the row
483
484 movq mm7, [edi+8] ; load last 8 bytes of pix2 row
485 movq mm5, [ebx+8] ; load last 8 bytes of the pix2 row below
486
487 movq mm2, mm7 ; copy mm7 on mm2
488 movq mm3, mm5 ; copy mm5 on mm3
489 punpcklbw mm7, mm6 ; first 4 bytes of [edi+8] on mm7
490 punpcklbw mm5, mm6 ; first 4 bytes of [ebx+8] on mm5
491 paddusw mm7, mm5 ; mm7 := first 4 bytes interpolated in words
492 psrlw mm7, 1
493
494 punpckhbw mm2, mm6 ; last 4 bytes of [edi+8] on mm2
495 punpckhbw mm3, mm6 ; last 4 bytes of [ebx+8] on mm3
496 paddusw mm2, mm3 ; mm2 := last 4 bytes interpolated in words
497 psrlw mm2, 1
498
499 packuswb mm7, mm2 ; pack 8 bytes interpolated on mm7
500 movq mm5,[esi+8] ; load last 8 bytes of pix1 row
501
502 movq mm3, mm7 ; mm7 := abs(mm7-mm5)
503 psubusb mm7, mm5
504 psubusb mm5, mm3
505 por mm7, mm5
506
507 ; Now mm4 and mm7 have 16 absdiffs to add
508
509 movq mm3, mm4 ; Make copies of these bytes
510 movq mm2, mm7
511
512 punpcklbw mm4, mm6 ; Unpack to words and add
513 punpcklbw mm7, mm6
514 paddusw mm4, mm7
515 paddusw mm0, mm4 ; Add to the accumulator (mm0)
516
517 punpckhbw mm3, mm6 ; Unpack to words and add
518 punpckhbw mm2, mm6
519 paddusw mm3, mm2
520 paddusw mm1, mm3 ; Add to the accumulator (mm1)
521
522 ; Loop termination
523
524 add esi, edx ; update pointers to next row
525 add edi, edx
526 add ebx, edx
527 sub ecx,1 ; one row was consumed in this pass
528 test ecx, ecx ; check rowsleft
529 jnz near .nextrow_y2 ; repeat until rowsleft is exactly 0
530
531 paddusw mm0, mm1 ; merge both accumulators (saturating add)
532
533 movq mm1, mm0 ; Copy mm0 to mm1
534 psrlq mm1, 32 ; bring words 2-3 down onto words 0-1
535 paddusw mm0, mm1 ; Add
536 movq mm2, mm0
537 psrlq mm2, 16 ; bring word 1 down onto word 0
538 paddusw mm0, mm2 ; word 0 now holds the sum of all four words
539 movd eax, mm0 ; Store return value
540 and eax, 0xffff ; keep word 0 only; bits 16-31 hold a partial sum
541
542 pop edi
543 pop esi
544 pop edx
545 pop ecx
546 pop ebx
547
548 pop ebp ; restore caller's frame pointer
549
550 emms ; clear the MMX state before returning
551 ret ; return
552
553 global pix_abs16x16_xy2_mmx
554 ; SAD of one block against the 2x2 average of the other (half-pel in x and y).
555 ; int pix_abs16x16_xy2_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
556 ; NOTE: the register roles below are swapped relative to the prototype's argument names.
557 ; esi = "p1" in the comments below: loaded from [ebp+12], the SECOND argument (interpolated)
558 ; edi = "p2" in the comments below: loaded from [ebp+8], the FIRST argument (used as is)
559 ; ebx = p1+lx, the row below esi; bytes 0..16 of h+1 rows are read through esi/ebx
560 ; ecx = rowsleft (init: h); drops by 1 per pass and the loop ends only at exactly 0
561 ; edx = lx; added to all three pointers to step to the next row
562 ; eax = result: SAD masked to 16 bits. emms is executed before returning.
563 ; mm0 = distance accumulators (4 words)
564 ; mm1 = bytes p2
565 ; mm2 = bytes p1
566 ; mm3 = bytes p1+lx
567 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
568 ; but I don't think that's going to happen in IA-32 land...
569 ; mm4 = temp 4 words: (p1 + p1+1 + p1+lx + p1+lx+1) >> 2
570 ; mm5 = temp 4 bytes in words from p2
571 ; mm6 = temp: unpacked bytes, then masked difference / comparison mask
572 ; mm7 = 0 while unpacking, then comparison mask mm4 > mm5
573
574
575 align 32
576 pix_abs16x16_xy2_mmx:
577 push ebp ; save caller's frame pointer
578 mov ebp, esp ; arguments are now at [ebp+8], [ebp+12], ...
579
580 push ebx ; Save every integer register pushed here so it can be
581 push ecx ; restored on exit
582 push edx ;
583 push esi
584 push edi
585
586 pxor mm0, mm0 ; zero accumulators
587
588 mov esi, [ebp+12] ; get "p1" (second stack argument)
589 mov edi, [ebp+8] ; get "p2" (first stack argument)
590 mov edx, [ebp+16] ; get lx
591 mov ecx, [ebp+20] ; rowsleft := h
592 mov ebx, esi ; ebx := p1 + lx (row below)
593 add ebx, edx
594 jmp .nextrowmm11 ; jump over the alignment padding that follows
595 align 32
596 .nextrowmm11:
597
598 ;;
599 ;; First 8 bytes of row
600 ;;
601
602 ;; First 4 bytes of 8
603
604 movq mm4, [esi] ; mm4 := first 8 bytes of p1
605 pxor mm7, mm7 ; mm7 := 0 for byte->word unpacking
606 movq mm2, mm4 ; mm2 records all 8 bytes
607 punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
608
609 movq mm6, [ebx] ; mm6 := first 8 bytes of p1+lx
610 movq mm3, mm6 ; mm3 records all 8 bytes
611 punpcklbw mm6, mm7 ; first 4 bytes of p1+lx in words
612 paddw mm4, mm6
613
614
615 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1
616 punpcklbw mm5, mm7 ; First 4 bytes of p1+1 in Words...
617 paddw mm4, mm5
618 movq mm6, [ebx+1] ; mm6 := 8 bytes at p1+lx+1
619 punpcklbw mm6, mm7 ; first 4 bytes of p1+lx+1 in words
620 paddw mm4, mm6
621
622 psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words (sum>>2, no rounding term)
623
624 movq mm5, [edi] ; mm5 := first 8 bytes of p2
625 movq mm1, mm5 ; mm1 records all 8 bytes
626 punpcklbw mm5, mm7 ; first 4 bytes of p2 in words
627
628 movq mm7,mm4
629 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
630
631 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
632 psubw mm6,mm5
633 pand mm6, mm7
634
635 paddw mm0, mm6 ; Add to accumulator
636
637 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
638 pcmpgtw mm6,mm4
639 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
640 pand mm5, mm6
641
642 paddw mm0, mm5 ; Add to accumulator
643
644 ;; Second 4 bytes of 8
645
646 movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
647 pxor mm7, mm7 ; mm7 held a mask; zero it again for unpacking
648 punpckhbw mm4, mm7
649 movq mm6, mm3 ; mm6 := Second 4 bytes p1+lx in words
650 punpckhbw mm6, mm7
651 paddw mm4, mm6
652
653 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1 (reloaded, not cached)
654 punpckhbw mm5, mm7 ; Second 4 bytes of p1+1 in Words...
655 paddw mm4, mm5
656 movq mm6, [ebx+1] ; mm6 := 8 bytes at p1+lx+1 (reloaded, not cached)
657 punpckhbw mm6, mm7 ; second 4 bytes of p1+lx+1 in words
658 paddw mm4, mm6
659
660 psrlw mm4, 2 ; mm4 := Second 4 bytes interpolated in words
661
662 movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
663 punpckhbw mm5, mm7
664
665 movq mm7,mm4
666 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
667
668 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
669 psubw mm6,mm5
670 pand mm6, mm7
671
672 paddw mm0, mm6 ; Add to accumulator
673
674 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
675 pcmpgtw mm6,mm4
676 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
677 pand mm5, mm6
678
679 paddw mm0, mm5 ; Add to accumulator
680
681
682 ;;
683 ;; Second 8 bytes of row
684 ;;
685 ;; First 4 bytes of 8
686
687 movq mm4, [esi+8] ; mm4 := 8 bytes at p1+8
688 pxor mm7, mm7 ; mm7 := 0 for byte->word unpacking
689 movq mm2, mm4 ; mm2 records all 8 bytes
690 punpcklbw mm4, mm7 ; First 4 bytes of p1+8 in Words...
691
692 movq mm6, [ebx+8] ; mm6 := 8 bytes at p1+lx+8
693 movq mm3, mm6 ; mm3 records all 8 bytes
694 punpcklbw mm6, mm7 ; first 4 bytes of p1+lx+8 in words
695 paddw mm4, mm6
696
697
698 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9
699 punpcklbw mm5, mm7 ; First 4 bytes of p1+9 in Words...
700 paddw mm4, mm5
701 movq mm6, [ebx+9] ; mm6 := 8 bytes at p1+lx+9
702 punpcklbw mm6, mm7 ; first 4 bytes of p1+lx+9 in words
703 paddw mm4, mm6
704
705 psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
706
707 movq mm5, [edi+8] ; mm5 := 8 bytes at p2+8
708 movq mm1, mm5 ; mm1 records all 8 bytes
709 punpcklbw mm5, mm7 ; first 4 bytes of p2+8 in words
710
711 movq mm7,mm4
712 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
713
714 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
715 psubw mm6,mm5
716 pand mm6, mm7
717
718 paddw mm0, mm6 ; Add to accumulator
719
720 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
721 pcmpgtw mm6,mm4
722 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
723 pand mm5, mm6
724
725 paddw mm0, mm5 ; Add to accumulator
726
727 ;; Second 4 bytes of 8
728
729 movq mm4, mm2 ; mm4 := Second 4 bytes of p1+8 in words
730 pxor mm7, mm7 ; mm7 held a mask; zero it again for unpacking
731 punpckhbw mm4, mm7
732 movq mm6, mm3 ; mm6 := Second 4 bytes of p1+lx+8 in words
733 punpckhbw mm6, mm7
734 paddw mm4, mm6
735
736 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9 (reloaded, not cached)
737 punpckhbw mm5, mm7 ; Second 4 bytes of p1+9 in Words...
738 paddw mm4, mm5
739 movq mm6, [ebx+9] ; mm6 := 8 bytes at p1+lx+9 (reloaded, not cached)
740 punpckhbw mm6, mm7 ; second 4 bytes of p1+lx+9 in words
741 paddw mm4, mm6
742
743 psrlw mm4, 2 ; mm4 := Second 4 bytes interpolated in words
744
745 movq mm5, mm1 ; mm5:= second 4 bytes of p2+8 in words
746 punpckhbw mm5, mm7
747
748 movq mm7,mm4
749 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
750
751 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
752 psubw mm6,mm5
753 pand mm6, mm7
754
755 paddw mm0, mm6 ; Add to accumulator
756
757 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
758 pcmpgtw mm6,mm4
759 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
760 pand mm5, mm6
761
762 paddw mm0, mm5 ; Add to accumulator
763
764
765 ;;
766 ;; Loop termination condition... and stepping
767 ;;
768
769 add esi, edx ; update pointer to next row
770 add edi, edx ; ditto
771 add ebx, edx ; ditto for the row-below pointer
772
773 sub ecx,1 ; one row was consumed in this pass
774 test ecx, ecx ; check rowsleft
775 jnz near .nextrowmm11 ; repeat until rowsleft is exactly 0
776
777 ;; Sum the Accumulators
778 movq mm4, mm0
779 psrlq mm4, 32 ; bring words 2-3 down onto words 0-1
780 paddw mm0, mm4
781 movq mm6, mm0
782 psrlq mm6, 16 ; bring word 1 down onto word 0
783 paddw mm0, mm6 ; word 0 now holds the sum of all four words (mod 2^16)
784 movd eax, mm0 ; store return value
785 and eax, 0xffff ; keep word 0 only; bits 16-31 hold a partial sum
786
787 pop edi
788 pop esi
789 pop edx
790 pop ecx
791 pop ebx
792
793 pop ebp ; restore caller's frame pointer
794
795 emms ; clear the MMX state before returning
796 ret ; we now return you to your regular programming
797
798