source: liacs/MIR2010/SourceCode/cximage/png/pngvcrd.c@ 404

Last change on this file since 404 was 95, checked in by Rick van der Zwet, 15 years ago

Bad boy, improper move of directory

File size: 143.9 KB
RevLine 
[95]1
2/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 *
4 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 *
6 * Last changed in libpng 1.2.6 - August 15, 2004
7 * For conditions of distribution and use, see copyright notice in png.h
8 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
9 * Copyright (c) 1998, Intel Corporation
10 *
11 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
12 * Interface to libpng contributed by Gilles Vollant, 1999
13 *
14 *
15 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
16 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
17 * in bad pixels at the beginning of some rows of some images, and also
18 * (due to out-of-range memory reads and writes) caused heap corruption
19 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 *
21 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 *
23 * [runtime MMX configuration, GRR 20010102]
24 *
25 */
26
27#define PNG_INTERNAL
28#include "png.h"
29
30#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31
32static int mmx_supported=2;
33
34
35int PNGAPI
36png_mmx_support(void)
37{
38 int mmx_supported_local = 0;
39 _asm {
40 push ebx //CPUID will trash these
41 push ecx
42 push edx
43
44 pushfd //Save Eflag to stack
45 pop eax //Get Eflag from stack into eax
46 mov ecx, eax //Make another copy of Eflag in ecx
47 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
48 push eax //Save modified Eflag back to stack
49
50 popfd //Restored modified value back to Eflag reg
51 pushfd //Save Eflag to stack
52 pop eax //Get Eflag from stack
53 push ecx // save original Eflag to stack
54 popfd // restore original Eflag
55 xor eax, ecx //Compare the new Eflag with the original Eflag
56 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
57 //skip following instructions and jump to
58 //NOT_SUPPORTED label
59
60 xor eax, eax //Set eax to zero
61
62 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
63 _asm _emit 0xa2
64
65 cmp eax, 1 //make sure eax return non-zero value
66 jl NOT_SUPPORTED //If eax is zero, mmx not supported
67
68 xor eax, eax //set eax to zero
69 inc eax //Now increment eax to 1. This instruction is
70 //faster than the instruction "mov eax, 1"
71
72 _asm _emit 0x0f //CPUID instruction
73 _asm _emit 0xa2
74
75 and edx, 0x00800000 //mask out all bits but mmx bit(24)
76 cmp edx, 0 // 0 = mmx not supported
77 jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
78
79 mov mmx_supported_local, 1 //set return value to 1
80
81NOT_SUPPORTED:
82 mov eax, mmx_supported_local //move return value to eax
83 pop edx //CPUID trashed these
84 pop ecx
85 pop ebx
86 }
87
88 //mmx_supported_local=0; // test code for force don't support MMX
89 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
90
91 mmx_supported = mmx_supported_local;
92 return mmx_supported_local;
93}
94
95/* Combines the row recently read in with the previous row.
96 This routine takes care of alpha and transparency if requested.
97 This routine also handles the two methods of progressive display
98 of interlaced images, depending on the mask value.
99 The mask value describes which pixels are to be combined with
100 the row. The pattern always repeats every 8 pixels, so just 8
101 bits are needed. A one indicates the pixel is to be combined; a
102 zero indicates the pixel is to be skipped. This is in addition
103 to any alpha or transparency value associated with the pixel. If
104 you want all pixels to be combined, pass 0xff (255) in mask. */
105
106/* Use this routine for x86 platform - uses faster MMX routine if machine
107 supports MMX */
108
109void /* PRIVATE */
110png_combine_row(png_structp png_ptr, png_bytep row, int mask)
111{
112#ifdef PNG_USE_LOCAL_ARRAYS
113 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
114#endif
115
116 png_debug(1,"in png_combine_row_asm\n");
117
118 if (mmx_supported == 2) {
119#if !defined(PNG_1_0_X)
120 /* this should have happened in png_init_mmx_flags() already */
121 png_warning(png_ptr, "asm_flags may not have been initialized");
122#endif
123 png_mmx_support();
124 }
125
126 if (mask == 0xff)
127 {
128 png_memcpy(row, png_ptr->row_buf + 1,
129 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
130 png_ptr->width));
131 }
132 /* GRR: add "else if (mask == 0)" case?
133 * or does png_combine_row() not even get called in that case? */
134 else
135 {
136 switch (png_ptr->row_info.pixel_depth)
137 {
138 case 1:
139 {
140 png_bytep sp;
141 png_bytep dp;
142 int s_inc, s_start, s_end;
143 int m;
144 int shift;
145 png_uint_32 i;
146
147 sp = png_ptr->row_buf + 1;
148 dp = row;
149 m = 0x80;
150#if defined(PNG_READ_PACKSWAP_SUPPORTED)
151 if (png_ptr->transformations & PNG_PACKSWAP)
152 {
153 s_start = 0;
154 s_end = 7;
155 s_inc = 1;
156 }
157 else
158#endif
159 {
160 s_start = 7;
161 s_end = 0;
162 s_inc = -1;
163 }
164
165 shift = s_start;
166
167 for (i = 0; i < png_ptr->width; i++)
168 {
169 if (m & mask)
170 {
171 int value;
172
173 value = (*sp >> shift) & 0x1;
174 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
175 *dp |= (png_byte)(value << shift);
176 }
177
178 if (shift == s_end)
179 {
180 shift = s_start;
181 sp++;
182 dp++;
183 }
184 else
185 shift += s_inc;
186
187 if (m == 1)
188 m = 0x80;
189 else
190 m >>= 1;
191 }
192 break;
193 }
194
195 case 2:
196 {
197 png_bytep sp;
198 png_bytep dp;
199 int s_start, s_end, s_inc;
200 int m;
201 int shift;
202 png_uint_32 i;
203 int value;
204
205 sp = png_ptr->row_buf + 1;
206 dp = row;
207 m = 0x80;
208#if defined(PNG_READ_PACKSWAP_SUPPORTED)
209 if (png_ptr->transformations & PNG_PACKSWAP)
210 {
211 s_start = 0;
212 s_end = 6;
213 s_inc = 2;
214 }
215 else
216#endif
217 {
218 s_start = 6;
219 s_end = 0;
220 s_inc = -2;
221 }
222
223 shift = s_start;
224
225 for (i = 0; i < png_ptr->width; i++)
226 {
227 if (m & mask)
228 {
229 value = (*sp >> shift) & 0x3;
230 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
231 *dp |= (png_byte)(value << shift);
232 }
233
234 if (shift == s_end)
235 {
236 shift = s_start;
237 sp++;
238 dp++;
239 }
240 else
241 shift += s_inc;
242 if (m == 1)
243 m = 0x80;
244 else
245 m >>= 1;
246 }
247 break;
248 }
249
250 case 4:
251 {
252 png_bytep sp;
253 png_bytep dp;
254 int s_start, s_end, s_inc;
255 int m;
256 int shift;
257 png_uint_32 i;
258 int value;
259
260 sp = png_ptr->row_buf + 1;
261 dp = row;
262 m = 0x80;
263#if defined(PNG_READ_PACKSWAP_SUPPORTED)
264 if (png_ptr->transformations & PNG_PACKSWAP)
265 {
266 s_start = 0;
267 s_end = 4;
268 s_inc = 4;
269 }
270 else
271#endif
272 {
273 s_start = 4;
274 s_end = 0;
275 s_inc = -4;
276 }
277 shift = s_start;
278
279 for (i = 0; i < png_ptr->width; i++)
280 {
281 if (m & mask)
282 {
283 value = (*sp >> shift) & 0xf;
284 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
285 *dp |= (png_byte)(value << shift);
286 }
287
288 if (shift == s_end)
289 {
290 shift = s_start;
291 sp++;
292 dp++;
293 }
294 else
295 shift += s_inc;
296 if (m == 1)
297 m = 0x80;
298 else
299 m >>= 1;
300 }
301 break;
302 }
303
304 case 8:
305 {
306 png_bytep srcptr;
307 png_bytep dstptr;
308 png_uint_32 len;
309 int m;
310 int diff, unmask;
311
312 __int64 mask0=0x0102040810204080;
313
314#if !defined(PNG_1_0_X)
315 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
316 /* && mmx_supported */ )
317#else
318 if (mmx_supported)
319#endif
320 {
321 srcptr = png_ptr->row_buf + 1;
322 dstptr = row;
323 m = 0x80;
324 unmask = ~mask;
325 len = png_ptr->width &~7; //reduce to multiple of 8
326 diff = png_ptr->width & 7; //amount lost
327
328 _asm
329 {
330 movd mm7, unmask //load bit pattern
331 psubb mm6,mm6 //zero mm6
332 punpcklbw mm7,mm7
333 punpcklwd mm7,mm7
334 punpckldq mm7,mm7 //fill register with 8 masks
335
336 movq mm0,mask0
337
338 pand mm0,mm7 //nonzero if keep byte
339 pcmpeqb mm0,mm6 //zeros->1s, v versa
340
341 mov ecx,len //load length of line (pixels)
342 mov esi,srcptr //load source
343 mov ebx,dstptr //load dest
344 cmp ecx,0 //lcr
345 je mainloop8end
346
347mainloop8:
348 movq mm4,[esi]
349 pand mm4,mm0
350 movq mm6,mm0
351 pandn mm6,[ebx]
352 por mm4,mm6
353 movq [ebx],mm4
354
355 add esi,8 //inc by 8 bytes processed
356 add ebx,8
357 sub ecx,8 //dec by 8 pixels processed
358
359 ja mainloop8
360mainloop8end:
361
362 mov ecx,diff
363 cmp ecx,0
364 jz end8
365
366 mov edx,mask
367 sal edx,24 //make low byte the high byte
368
369secondloop8:
370 sal edx,1 //move high bit to CF
371 jnc skip8 //if CF = 0
372 mov al,[esi]
373 mov [ebx],al
374skip8:
375 inc esi
376 inc ebx
377
378 dec ecx
379 jnz secondloop8
380end8:
381 emms
382 }
383 }
384 else /* mmx not supported - use modified C routine */
385 {
386 register unsigned int incr1, initial_val, final_val;
387 png_size_t pixel_bytes;
388 png_uint_32 i;
389 register int disp = png_pass_inc[png_ptr->pass];
390 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
391
392 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
393 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
394 pixel_bytes;
395 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
396 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
397 final_val = png_ptr->width*pixel_bytes;
398 incr1 = (disp)*pixel_bytes;
399 for (i = initial_val; i < final_val; i += incr1)
400 {
401 png_memcpy(dstptr, srcptr, pixel_bytes);
402 srcptr += incr1;
403 dstptr += incr1;
404 }
405 } /* end of else */
406
407 break;
408 } // end 8 bpp
409
410 case 16:
411 {
412 png_bytep srcptr;
413 png_bytep dstptr;
414 png_uint_32 len;
415 int unmask, diff;
416 __int64 mask1=0x0101020204040808,
417 mask0=0x1010202040408080;
418
419#if !defined(PNG_1_0_X)
420 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
421 /* && mmx_supported */ )
422#else
423 if (mmx_supported)
424#endif
425 {
426 srcptr = png_ptr->row_buf + 1;
427 dstptr = row;
428
429 unmask = ~mask;
430 len = (png_ptr->width)&~7;
431 diff = (png_ptr->width)&7;
432 _asm
433 {
434 movd mm7, unmask //load bit pattern
435 psubb mm6,mm6 //zero mm6
436 punpcklbw mm7,mm7
437 punpcklwd mm7,mm7
438 punpckldq mm7,mm7 //fill register with 8 masks
439
440 movq mm0,mask0
441 movq mm1,mask1
442
443 pand mm0,mm7
444 pand mm1,mm7
445
446 pcmpeqb mm0,mm6
447 pcmpeqb mm1,mm6
448
449 mov ecx,len //load length of line
450 mov esi,srcptr //load source
451 mov ebx,dstptr //load dest
452 cmp ecx,0 //lcr
453 jz mainloop16end
454
455mainloop16:
456 movq mm4,[esi]
457 pand mm4,mm0
458 movq mm6,mm0
459 movq mm7,[ebx]
460 pandn mm6,mm7
461 por mm4,mm6
462 movq [ebx],mm4
463
464 movq mm5,[esi+8]
465 pand mm5,mm1
466 movq mm7,mm1
467 movq mm6,[ebx+8]
468 pandn mm7,mm6
469 por mm5,mm7
470 movq [ebx+8],mm5
471
472 add esi,16 //inc by 16 bytes processed
473 add ebx,16
474 sub ecx,8 //dec by 8 pixels processed
475
476 ja mainloop16
477
478mainloop16end:
479 mov ecx,diff
480 cmp ecx,0
481 jz end16
482
483 mov edx,mask
484 sal edx,24 //make low byte the high byte
485secondloop16:
486 sal edx,1 //move high bit to CF
487 jnc skip16 //if CF = 0
488 mov ax,[esi]
489 mov [ebx],ax
490skip16:
491 add esi,2
492 add ebx,2
493
494 dec ecx
495 jnz secondloop16
496end16:
497 emms
498 }
499 }
500 else /* mmx not supported - use modified C routine */
501 {
502 register unsigned int incr1, initial_val, final_val;
503 png_size_t pixel_bytes;
504 png_uint_32 i;
505 register int disp = png_pass_inc[png_ptr->pass];
506 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
507
508 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
509 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
510 pixel_bytes;
511 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
512 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
513 final_val = png_ptr->width*pixel_bytes;
514 incr1 = (disp)*pixel_bytes;
515 for (i = initial_val; i < final_val; i += incr1)
516 {
517 png_memcpy(dstptr, srcptr, pixel_bytes);
518 srcptr += incr1;
519 dstptr += incr1;
520 }
521 } /* end of else */
522
523 break;
524 } // end 16 bpp
525
526 case 24:
527 {
528 png_bytep srcptr;
529 png_bytep dstptr;
530 png_uint_32 len;
531 int unmask, diff;
532
533 __int64 mask2=0x0101010202020404, //24bpp
534 mask1=0x0408080810101020,
535 mask0=0x2020404040808080;
536
537 srcptr = png_ptr->row_buf + 1;
538 dstptr = row;
539
540 unmask = ~mask;
541 len = (png_ptr->width)&~7;
542 diff = (png_ptr->width)&7;
543
544#if !defined(PNG_1_0_X)
545 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
546 /* && mmx_supported */ )
547#else
548 if (mmx_supported)
549#endif
550 {
551 _asm
552 {
553 movd mm7, unmask //load bit pattern
554 psubb mm6,mm6 //zero mm6
555 punpcklbw mm7,mm7
556 punpcklwd mm7,mm7
557 punpckldq mm7,mm7 //fill register with 8 masks
558
559 movq mm0,mask0
560 movq mm1,mask1
561 movq mm2,mask2
562
563 pand mm0,mm7
564 pand mm1,mm7
565 pand mm2,mm7
566
567 pcmpeqb mm0,mm6
568 pcmpeqb mm1,mm6
569 pcmpeqb mm2,mm6
570
571 mov ecx,len //load length of line
572 mov esi,srcptr //load source
573 mov ebx,dstptr //load dest
574 cmp ecx,0
575 jz mainloop24end
576
577mainloop24:
578 movq mm4,[esi]
579 pand mm4,mm0
580 movq mm6,mm0
581 movq mm7,[ebx]
582 pandn mm6,mm7
583 por mm4,mm6
584 movq [ebx],mm4
585
586
587 movq mm5,[esi+8]
588 pand mm5,mm1
589 movq mm7,mm1
590 movq mm6,[ebx+8]
591 pandn mm7,mm6
592 por mm5,mm7
593 movq [ebx+8],mm5
594
595 movq mm6,[esi+16]
596 pand mm6,mm2
597 movq mm4,mm2
598 movq mm7,[ebx+16]
599 pandn mm4,mm7
600 por mm6,mm4
601 movq [ebx+16],mm6
602
603 add esi,24 //inc by 24 bytes processed
604 add ebx,24
605 sub ecx,8 //dec by 8 pixels processed
606
607 ja mainloop24
608
609mainloop24end:
610 mov ecx,diff
611 cmp ecx,0
612 jz end24
613
614 mov edx,mask
615 sal edx,24 //make low byte the high byte
616secondloop24:
617 sal edx,1 //move high bit to CF
618 jnc skip24 //if CF = 0
619 mov ax,[esi]
620 mov [ebx],ax
621 xor eax,eax
622 mov al,[esi+2]
623 mov [ebx+2],al
624skip24:
625 add esi,3
626 add ebx,3
627
628 dec ecx
629 jnz secondloop24
630
631end24:
632 emms
633 }
634 }
635 else /* mmx not supported - use modified C routine */
636 {
637 register unsigned int incr1, initial_val, final_val;
638 png_size_t pixel_bytes;
639 png_uint_32 i;
640 register int disp = png_pass_inc[png_ptr->pass];
641 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
642
643 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
644 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
645 pixel_bytes;
646 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
647 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
648 final_val = png_ptr->width*pixel_bytes;
649 incr1 = (disp)*pixel_bytes;
650 for (i = initial_val; i < final_val; i += incr1)
651 {
652 png_memcpy(dstptr, srcptr, pixel_bytes);
653 srcptr += incr1;
654 dstptr += incr1;
655 }
656 } /* end of else */
657
658 break;
659 } // end 24 bpp
660
661 case 32:
662 {
663 png_bytep srcptr;
664 png_bytep dstptr;
665 png_uint_32 len;
666 int unmask, diff;
667
668 __int64 mask3=0x0101010102020202, //32bpp
669 mask2=0x0404040408080808,
670 mask1=0x1010101020202020,
671 mask0=0x4040404080808080;
672
673 srcptr = png_ptr->row_buf + 1;
674 dstptr = row;
675
676 unmask = ~mask;
677 len = (png_ptr->width)&~7;
678 diff = (png_ptr->width)&7;
679
680#if !defined(PNG_1_0_X)
681 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
682 /* && mmx_supported */ )
683#else
684 if (mmx_supported)
685#endif
686 {
687 _asm
688 {
689 movd mm7, unmask //load bit pattern
690 psubb mm6,mm6 //zero mm6
691 punpcklbw mm7,mm7
692 punpcklwd mm7,mm7
693 punpckldq mm7,mm7 //fill register with 8 masks
694
695 movq mm0,mask0
696 movq mm1,mask1
697 movq mm2,mask2
698 movq mm3,mask3
699
700 pand mm0,mm7
701 pand mm1,mm7
702 pand mm2,mm7
703 pand mm3,mm7
704
705 pcmpeqb mm0,mm6
706 pcmpeqb mm1,mm6
707 pcmpeqb mm2,mm6
708 pcmpeqb mm3,mm6
709
710 mov ecx,len //load length of line
711 mov esi,srcptr //load source
712 mov ebx,dstptr //load dest
713
714 cmp ecx,0 //lcr
715 jz mainloop32end
716
717mainloop32:
718 movq mm4,[esi]
719 pand mm4,mm0
720 movq mm6,mm0
721 movq mm7,[ebx]
722 pandn mm6,mm7
723 por mm4,mm6
724 movq [ebx],mm4
725
726 movq mm5,[esi+8]
727 pand mm5,mm1
728 movq mm7,mm1
729 movq mm6,[ebx+8]
730 pandn mm7,mm6
731 por mm5,mm7
732 movq [ebx+8],mm5
733
734 movq mm6,[esi+16]
735 pand mm6,mm2
736 movq mm4,mm2
737 movq mm7,[ebx+16]
738 pandn mm4,mm7
739 por mm6,mm4
740 movq [ebx+16],mm6
741
742 movq mm7,[esi+24]
743 pand mm7,mm3
744 movq mm5,mm3
745 movq mm4,[ebx+24]
746 pandn mm5,mm4
747 por mm7,mm5
748 movq [ebx+24],mm7
749
750 add esi,32 //inc by 32 bytes processed
751 add ebx,32
752 sub ecx,8 //dec by 8 pixels processed
753
754 ja mainloop32
755
756mainloop32end:
757 mov ecx,diff
758 cmp ecx,0
759 jz end32
760
761 mov edx,mask
762 sal edx,24 //make low byte the high byte
763secondloop32:
764 sal edx,1 //move high bit to CF
765 jnc skip32 //if CF = 0
766 mov eax,[esi]
767 mov [ebx],eax
768skip32:
769 add esi,4
770 add ebx,4
771
772 dec ecx
773 jnz secondloop32
774
775end32:
776 emms
777 }
778 }
779 else /* mmx _not supported - Use modified C routine */
780 {
781 register unsigned int incr1, initial_val, final_val;
782 png_size_t pixel_bytes;
783 png_uint_32 i;
784 register int disp = png_pass_inc[png_ptr->pass];
785 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
786
787 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
788 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
789 pixel_bytes;
790 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
791 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
792 final_val = png_ptr->width*pixel_bytes;
793 incr1 = (disp)*pixel_bytes;
794 for (i = initial_val; i < final_val; i += incr1)
795 {
796 png_memcpy(dstptr, srcptr, pixel_bytes);
797 srcptr += incr1;
798 dstptr += incr1;
799 }
800 } /* end of else */
801
802 break;
803 } // end 32 bpp
804
805 case 48:
806 {
807 png_bytep srcptr;
808 png_bytep dstptr;
809 png_uint_32 len;
810 int unmask, diff;
811
812 __int64 mask5=0x0101010101010202,
813 mask4=0x0202020204040404,
814 mask3=0x0404080808080808,
815 mask2=0x1010101010102020,
816 mask1=0x2020202040404040,
817 mask0=0x4040808080808080;
818
819#if !defined(PNG_1_0_X)
820 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
821 /* && mmx_supported */ )
822#else
823 if (mmx_supported)
824#endif
825 {
826 srcptr = png_ptr->row_buf + 1;
827 dstptr = row;
828
829 unmask = ~mask;
830 len = (png_ptr->width)&~7;
831 diff = (png_ptr->width)&7;
832 _asm
833 {
834 movd mm7, unmask //load bit pattern
835 psubb mm6,mm6 //zero mm6
836 punpcklbw mm7,mm7
837 punpcklwd mm7,mm7
838 punpckldq mm7,mm7 //fill register with 8 masks
839
840 movq mm0,mask0
841 movq mm1,mask1
842 movq mm2,mask2
843 movq mm3,mask3
844 movq mm4,mask4
845 movq mm5,mask5
846
847 pand mm0,mm7
848 pand mm1,mm7
849 pand mm2,mm7
850 pand mm3,mm7
851 pand mm4,mm7
852 pand mm5,mm7
853
854 pcmpeqb mm0,mm6
855 pcmpeqb mm1,mm6
856 pcmpeqb mm2,mm6
857 pcmpeqb mm3,mm6
858 pcmpeqb mm4,mm6
859 pcmpeqb mm5,mm6
860
861 mov ecx,len //load length of line
862 mov esi,srcptr //load source
863 mov ebx,dstptr //load dest
864
865 cmp ecx,0
866 jz mainloop48end
867
868mainloop48:
869 movq mm7,[esi]
870 pand mm7,mm0
871 movq mm6,mm0
872 pandn mm6,[ebx]
873 por mm7,mm6
874 movq [ebx],mm7
875
876 movq mm6,[esi+8]
877 pand mm6,mm1
878 movq mm7,mm1
879 pandn mm7,[ebx+8]
880 por mm6,mm7
881 movq [ebx+8],mm6
882
883 movq mm6,[esi+16]
884 pand mm6,mm2
885 movq mm7,mm2
886 pandn mm7,[ebx+16]
887 por mm6,mm7
888 movq [ebx+16],mm6
889
890 movq mm7,[esi+24]
891 pand mm7,mm3
892 movq mm6,mm3
893 pandn mm6,[ebx+24]
894 por mm7,mm6
895 movq [ebx+24],mm7
896
897 movq mm6,[esi+32]
898 pand mm6,mm4
899 movq mm7,mm4
900 pandn mm7,[ebx+32]
901 por mm6,mm7
902 movq [ebx+32],mm6
903
904 movq mm7,[esi+40]
905 pand mm7,mm5
906 movq mm6,mm5
907 pandn mm6,[ebx+40]
908 por mm7,mm6
909 movq [ebx+40],mm7
910
911 add esi,48 //inc by 32 bytes processed
912 add ebx,48
913 sub ecx,8 //dec by 8 pixels processed
914
915 ja mainloop48
916mainloop48end:
917
918 mov ecx,diff
919 cmp ecx,0
920 jz end48
921
922 mov edx,mask
923 sal edx,24 //make low byte the high byte
924
925secondloop48:
926 sal edx,1 //move high bit to CF
927 jnc skip48 //if CF = 0
928 mov eax,[esi]
929 mov [ebx],eax
930skip48:
931 add esi,4
932 add ebx,4
933
934 dec ecx
935 jnz secondloop48
936
937end48:
938 emms
939 }
940 }
941 else /* mmx _not supported - Use modified C routine */
942 {
943 register unsigned int incr1, initial_val, final_val;
944 png_size_t pixel_bytes;
945 png_uint_32 i;
946 register int disp = png_pass_inc[png_ptr->pass];
947 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
948
949 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
950 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
951 pixel_bytes;
952 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
953 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
954 final_val = png_ptr->width*pixel_bytes;
955 incr1 = (disp)*pixel_bytes;
956 for (i = initial_val; i < final_val; i += incr1)
957 {
958 png_memcpy(dstptr, srcptr, pixel_bytes);
959 srcptr += incr1;
960 dstptr += incr1;
961 }
962 } /* end of else */
963
964 break;
965 } // end 48 bpp
966
967 default:
968 {
969 png_bytep sptr;
970 png_bytep dp;
971 png_size_t pixel_bytes;
972 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
973 unsigned int i;
974 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
975 register unsigned int incr1, initial_val, final_val;
976
977 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
978 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
979 pixel_bytes;
980 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
981 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
982 final_val = png_ptr->width*pixel_bytes;
983 incr1 = (disp)*pixel_bytes;
984 for (i = initial_val; i < final_val; i += incr1)
985 {
986 png_memcpy(dp, sptr, pixel_bytes);
987 sptr += incr1;
988 dp += incr1;
989 }
990 break;
991 }
992 } /* end switch (png_ptr->row_info.pixel_depth) */
993 } /* end if (non-trivial mask) */
994
995} /* end png_combine_row() */
996
997
998#if defined(PNG_READ_INTERLACING_SUPPORTED)
999
1000void /* PRIVATE */
1001png_do_read_interlace(png_structp png_ptr)
1002{
1003 png_row_infop row_info = &(png_ptr->row_info);
1004 png_bytep row = png_ptr->row_buf + 1;
1005 int pass = png_ptr->pass;
1006 png_uint_32 transformations = png_ptr->transformations;
1007#ifdef PNG_USE_LOCAL_ARRAYS
1008 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1009#endif
1010
1011 png_debug(1,"in png_do_read_interlace\n");
1012
1013 if (mmx_supported == 2) {
1014#if !defined(PNG_1_0_X)
1015 /* this should have happened in png_init_mmx_flags() already */
1016 png_warning(png_ptr, "asm_flags may not have been initialized");
1017#endif
1018 png_mmx_support();
1019 }
1020
1021 if (row != NULL && row_info != NULL)
1022 {
1023 png_uint_32 final_width;
1024
1025 final_width = row_info->width * png_pass_inc[pass];
1026
1027 switch (row_info->pixel_depth)
1028 {
1029 case 1:
1030 {
1031 png_bytep sp, dp;
1032 int sshift, dshift;
1033 int s_start, s_end, s_inc;
1034 png_byte v;
1035 png_uint_32 i;
1036 int j;
1037
1038 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1039 dp = row + (png_size_t)((final_width - 1) >> 3);
1040#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1041 if (transformations & PNG_PACKSWAP)
1042 {
1043 sshift = (int)((row_info->width + 7) & 7);
1044 dshift = (int)((final_width + 7) & 7);
1045 s_start = 7;
1046 s_end = 0;
1047 s_inc = -1;
1048 }
1049 else
1050#endif
1051 {
1052 sshift = 7 - (int)((row_info->width + 7) & 7);
1053 dshift = 7 - (int)((final_width + 7) & 7);
1054 s_start = 0;
1055 s_end = 7;
1056 s_inc = 1;
1057 }
1058
1059 for (i = row_info->width; i; i--)
1060 {
1061 v = (png_byte)((*sp >> sshift) & 0x1);
1062 for (j = 0; j < png_pass_inc[pass]; j++)
1063 {
1064 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1065 *dp |= (png_byte)(v << dshift);
1066 if (dshift == s_end)
1067 {
1068 dshift = s_start;
1069 dp--;
1070 }
1071 else
1072 dshift += s_inc;
1073 }
1074 if (sshift == s_end)
1075 {
1076 sshift = s_start;
1077 sp--;
1078 }
1079 else
1080 sshift += s_inc;
1081 }
1082 break;
1083 }
1084
1085 case 2:
1086 {
1087 png_bytep sp, dp;
1088 int sshift, dshift;
1089 int s_start, s_end, s_inc;
1090 png_uint_32 i;
1091
1092 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1093 dp = row + (png_size_t)((final_width - 1) >> 2);
1094#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1095 if (transformations & PNG_PACKSWAP)
1096 {
1097 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1098 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1099 s_start = 6;
1100 s_end = 0;
1101 s_inc = -2;
1102 }
1103 else
1104#endif
1105 {
1106 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1107 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1108 s_start = 0;
1109 s_end = 6;
1110 s_inc = 2;
1111 }
1112
1113 for (i = row_info->width; i; i--)
1114 {
1115 png_byte v;
1116 int j;
1117
1118 v = (png_byte)((*sp >> sshift) & 0x3);
1119 for (j = 0; j < png_pass_inc[pass]; j++)
1120 {
1121 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1122 *dp |= (png_byte)(v << dshift);
1123 if (dshift == s_end)
1124 {
1125 dshift = s_start;
1126 dp--;
1127 }
1128 else
1129 dshift += s_inc;
1130 }
1131 if (sshift == s_end)
1132 {
1133 sshift = s_start;
1134 sp--;
1135 }
1136 else
1137 sshift += s_inc;
1138 }
1139 break;
1140 }
1141
1142 case 4:
1143 {
1144 png_bytep sp, dp;
1145 int sshift, dshift;
1146 int s_start, s_end, s_inc;
1147 png_uint_32 i;
1148
1149 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1150 dp = row + (png_size_t)((final_width - 1) >> 1);
1151#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1152 if (transformations & PNG_PACKSWAP)
1153 {
1154 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1155 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1156 s_start = 4;
1157 s_end = 0;
1158 s_inc = -4;
1159 }
1160 else
1161#endif
1162 {
1163 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1164 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1165 s_start = 0;
1166 s_end = 4;
1167 s_inc = 4;
1168 }
1169
1170 for (i = row_info->width; i; i--)
1171 {
1172 png_byte v;
1173 int j;
1174
1175 v = (png_byte)((*sp >> sshift) & 0xf);
1176 for (j = 0; j < png_pass_inc[pass]; j++)
1177 {
1178 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1179 *dp |= (png_byte)(v << dshift);
1180 if (dshift == s_end)
1181 {
1182 dshift = s_start;
1183 dp--;
1184 }
1185 else
1186 dshift += s_inc;
1187 }
1188 if (sshift == s_end)
1189 {
1190 sshift = s_start;
1191 sp--;
1192 }
1193 else
1194 sshift += s_inc;
1195 }
1196 break;
1197 }
1198
1199 default: // This is the place where the routine is modified
1200 {
1201 __int64 const4 = 0x0000000000FFFFFF;
1202 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1203 __int64 const6 = 0x00000000000000FF;
1204 png_bytep sptr, dp;
1205 png_uint_32 i;
1206 png_size_t pixel_bytes;
1207 int width = row_info->width;
1208
1209 pixel_bytes = (row_info->pixel_depth >> 3);
1210
1211 sptr = row + (width - 1) * pixel_bytes;
1212 dp = row + (final_width - 1) * pixel_bytes;
1213 // New code by Nirav Chhatrapati - Intel Corporation
1214 // sign fix by GRR
1215 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1216
1217 // use MMX routine if machine supports it
1218#if !defined(PNG_1_0_X)
1219 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1220 /* && mmx_supported */ )
1221#else
1222 if (mmx_supported)
1223#endif
1224 {
1225 if (pixel_bytes == 3)
1226 {
1227 if (((pass == 0) || (pass == 1)) && width)
1228 {
1229 _asm
1230 {
1231 mov esi, sptr
1232 mov edi, dp
1233 mov ecx, width
1234 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1235loop_pass0:
1236 movd mm0, [esi] ; X X X X X v2 v1 v0
1237 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1238 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1239 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1240 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1241 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1242 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1243 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1244 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1245 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1246 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1247 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1248 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1249 movq [edi+16] , mm4
1250 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1251 movq [edi+8] , mm3
1252 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1253 sub esi, 3
1254 movq [edi], mm0
1255 sub edi, 24
1256 //sub esi, 3
1257 dec ecx
1258 jnz loop_pass0
1259 EMMS
1260 }
1261 }
1262 else if (((pass == 2) || (pass == 3)) && width)
1263 {
1264 _asm
1265 {
1266 mov esi, sptr
1267 mov edi, dp
1268 mov ecx, width
1269 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1270loop_pass2:
1271 movd mm0, [esi] ; X X X X X v2 v1 v0
1272 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1273 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1274 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1275 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1276 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1277 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1278 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1279 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1280 movq [edi+4], mm0 ; move to memory
1281 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1282 movd [edi], mm0 ; move to memory
1283 sub esi, 3
1284 sub edi, 12
1285 dec ecx
1286 jnz loop_pass2
1287 EMMS
1288 }
1289 }
1290 else if (width) /* && ((pass == 4) || (pass == 5)) */
1291 {
1292 int width_mmx = ((width >> 1) << 1) - 8;
1293 if (width_mmx < 0)
1294 width_mmx = 0;
1295 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1296 if (width_mmx)
1297 {
1298 _asm
1299 {
1300 mov esi, sptr
1301 mov edi, dp
1302 mov ecx, width_mmx
1303 sub esi, 3
1304 sub edi, 9
1305loop_pass4:
1306 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1307 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1308 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1309 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1310 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1311 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1312 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1313 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1314 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1315 movq [edi], mm0 ; move quad to memory
1316 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1317 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1318 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1319 movd [edi+8], mm6 ; move double to memory
1320 sub esi, 6
1321 sub edi, 12
1322 sub ecx, 2
1323 jnz loop_pass4
1324 EMMS
1325 }
1326 }
1327
1328 sptr -= width_mmx*3;
1329 dp -= width_mmx*6;
1330 for (i = width; i; i--)
1331 {
1332 png_byte v[8];
1333 int j;
1334
1335 png_memcpy(v, sptr, 3);
1336 for (j = 0; j < png_pass_inc[pass]; j++)
1337 {
1338 png_memcpy(dp, v, 3);
1339 dp -= 3;
1340 }
1341 sptr -= 3;
1342 }
1343 }
1344 } /* end of pixel_bytes == 3 */
1345
1346 else if (pixel_bytes == 1)
1347 {
1348 if (((pass == 0) || (pass == 1)) && width)
1349 {
1350 int width_mmx = ((width >> 2) << 2);
1351 width -= width_mmx;
1352 if (width_mmx)
1353 {
1354 _asm
1355 {
1356 mov esi, sptr
1357 mov edi, dp
1358 mov ecx, width_mmx
1359 sub edi, 31
1360 sub esi, 3
1361loop1_pass0:
1362 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1363 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1364 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1365 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1366 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1367 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1368 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1369 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1370 movq [edi], mm0 ; move to memory v3
1371 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1372 movq [edi+8], mm3 ; move to memory v2
1373 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1374 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1375 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1376 movq [edi+16], mm2 ; move to memory v1
1377 movq [edi+24], mm4 ; move to memory v0
1378 sub esi, 4
1379 sub edi, 32
1380 sub ecx, 4
1381 jnz loop1_pass0
1382 EMMS
1383 }
1384 }
1385
1386 sptr -= width_mmx;
1387 dp -= width_mmx*8;
1388 for (i = width; i; i--)
1389 {
1390 int j;
1391
1392 /* I simplified this part in version 1.0.4e
1393 * here and in several other instances where
1394 * pixel_bytes == 1 -- GR-P
1395 *
1396 * Original code:
1397 *
1398 * png_byte v[8];
1399 * png_memcpy(v, sptr, pixel_bytes);
1400 * for (j = 0; j < png_pass_inc[pass]; j++)
1401 * {
1402 * png_memcpy(dp, v, pixel_bytes);
1403 * dp -= pixel_bytes;
1404 * }
1405 * sptr -= pixel_bytes;
1406 *
1407 * Replacement code is in the next three lines:
1408 */
1409
1410 for (j = 0; j < png_pass_inc[pass]; j++)
1411 *dp-- = *sptr;
1412 sptr--;
1413 }
1414 }
1415 else if (((pass == 2) || (pass == 3)) && width)
1416 {
1417 int width_mmx = ((width >> 2) << 2);
1418 width -= width_mmx;
1419 if (width_mmx)
1420 {
1421 _asm
1422 {
1423 mov esi, sptr
1424 mov edi, dp
1425 mov ecx, width_mmx
1426 sub edi, 15
1427 sub esi, 3
1428loop1_pass2:
1429 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1430 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1431 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1432 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1433 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1434 movq [edi], mm0 ; move to memory v2 and v3
1435 sub esi, 4
1436 movq [edi+8], mm1 ; move to memory v1 and v0
1437 sub edi, 16
1438 sub ecx, 4
1439 jnz loop1_pass2
1440 EMMS
1441 }
1442 }
1443
1444 sptr -= width_mmx;
1445 dp -= width_mmx*4;
1446 for (i = width; i; i--)
1447 {
1448 int j;
1449
1450 for (j = 0; j < png_pass_inc[pass]; j++)
1451 {
1452 *dp-- = *sptr;
1453 }
1454 sptr --;
1455 }
1456 }
1457 else if (width) /* && ((pass == 4) || (pass == 5))) */
1458 {
1459 int width_mmx = ((width >> 3) << 3);
1460 width -= width_mmx;
1461 if (width_mmx)
1462 {
1463 _asm
1464 {
1465 mov esi, sptr
1466 mov edi, dp
1467 mov ecx, width_mmx
1468 sub edi, 15
1469 sub esi, 7
1470loop1_pass4:
1471 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1472 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1473 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1474 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1475 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1476 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1477 sub esi, 8
1478 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1479 //sub esi, 4
1480 sub edi, 16
1481 sub ecx, 8
1482 jnz loop1_pass4
1483 EMMS
1484 }
1485 }
1486
1487 sptr -= width_mmx;
1488 dp -= width_mmx*2;
1489 for (i = width; i; i--)
1490 {
1491 int j;
1492
1493 for (j = 0; j < png_pass_inc[pass]; j++)
1494 {
1495 *dp-- = *sptr;
1496 }
1497 sptr --;
1498 }
1499 }
1500 } /* end of pixel_bytes == 1 */
1501
1502 else if (pixel_bytes == 2)
1503 {
1504 if (((pass == 0) || (pass == 1)) && width)
1505 {
1506 int width_mmx = ((width >> 1) << 1);
1507 width -= width_mmx;
1508 if (width_mmx)
1509 {
1510 _asm
1511 {
1512 mov esi, sptr
1513 mov edi, dp
1514 mov ecx, width_mmx
1515 sub esi, 2
1516 sub edi, 30
1517loop2_pass0:
1518 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1519 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1520 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1521 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1522 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1523 movq [edi], mm0
1524 movq [edi + 8], mm0
1525 movq [edi + 16], mm1
1526 movq [edi + 24], mm1
1527 sub esi, 4
1528 sub edi, 32
1529 sub ecx, 2
1530 jnz loop2_pass0
1531 EMMS
1532 }
1533 }
1534
1535 sptr -= (width_mmx*2 - 2); // sign fixed
1536 dp -= (width_mmx*16 - 2); // sign fixed
1537 for (i = width; i; i--)
1538 {
1539 png_byte v[8];
1540 int j;
1541 sptr -= 2;
1542 png_memcpy(v, sptr, 2);
1543 for (j = 0; j < png_pass_inc[pass]; j++)
1544 {
1545 dp -= 2;
1546 png_memcpy(dp, v, 2);
1547 }
1548 }
1549 }
1550 else if (((pass == 2) || (pass == 3)) && width)
1551 {
1552 int width_mmx = ((width >> 1) << 1) ;
1553 width -= width_mmx;
1554 if (width_mmx)
1555 {
1556 _asm
1557 {
1558 mov esi, sptr
1559 mov edi, dp
1560 mov ecx, width_mmx
1561 sub esi, 2
1562 sub edi, 14
1563loop2_pass2:
1564 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1565 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1566 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1567 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1568 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1569 movq [edi], mm0
1570 sub esi, 4
1571 movq [edi + 8], mm1
1572 //sub esi, 4
1573 sub edi, 16
1574 sub ecx, 2
1575 jnz loop2_pass2
1576 EMMS
1577 }
1578 }
1579
1580 sptr -= (width_mmx*2 - 2); // sign fixed
1581 dp -= (width_mmx*8 - 2); // sign fixed
1582 for (i = width; i; i--)
1583 {
1584 png_byte v[8];
1585 int j;
1586 sptr -= 2;
1587 png_memcpy(v, sptr, 2);
1588 for (j = 0; j < png_pass_inc[pass]; j++)
1589 {
1590 dp -= 2;
1591 png_memcpy(dp, v, 2);
1592 }
1593 }
1594 }
1595 else if (width) // pass == 4 or 5
1596 {
1597 int width_mmx = ((width >> 1) << 1) ;
1598 width -= width_mmx;
1599 if (width_mmx)
1600 {
1601 _asm
1602 {
1603 mov esi, sptr
1604 mov edi, dp
1605 mov ecx, width_mmx
1606 sub esi, 2
1607 sub edi, 6
1608loop2_pass4:
1609 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1610 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1611 sub esi, 4
1612 movq [edi], mm0
1613 sub edi, 8
1614 sub ecx, 2
1615 jnz loop2_pass4
1616 EMMS
1617 }
1618 }
1619
1620 sptr -= (width_mmx*2 - 2); // sign fixed
1621 dp -= (width_mmx*4 - 2); // sign fixed
1622 for (i = width; i; i--)
1623 {
1624 png_byte v[8];
1625 int j;
1626 sptr -= 2;
1627 png_memcpy(v, sptr, 2);
1628 for (j = 0; j < png_pass_inc[pass]; j++)
1629 {
1630 dp -= 2;
1631 png_memcpy(dp, v, 2);
1632 }
1633 }
1634 }
1635 } /* end of pixel_bytes == 2 */
1636
1637 else if (pixel_bytes == 4)
1638 {
1639 if (((pass == 0) || (pass == 1)) && width)
1640 {
1641 int width_mmx = ((width >> 1) << 1) ;
1642 width -= width_mmx;
1643 if (width_mmx)
1644 {
1645 _asm
1646 {
1647 mov esi, sptr
1648 mov edi, dp
1649 mov ecx, width_mmx
1650 sub esi, 4
1651 sub edi, 60
1652loop4_pass0:
1653 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1654 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1655 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1656 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1657 movq [edi], mm0
1658 movq [edi + 8], mm0
1659 movq [edi + 16], mm0
1660 movq [edi + 24], mm0
1661 movq [edi+32], mm1
1662 movq [edi + 40], mm1
1663 movq [edi+ 48], mm1
1664 sub esi, 8
1665 movq [edi + 56], mm1
1666 sub edi, 64
1667 sub ecx, 2
1668 jnz loop4_pass0
1669 EMMS
1670 }
1671 }
1672
1673 sptr -= (width_mmx*4 - 4); // sign fixed
1674 dp -= (width_mmx*32 - 4); // sign fixed
1675 for (i = width; i; i--)
1676 {
1677 png_byte v[8];
1678 int j;
1679 sptr -= 4;
1680 png_memcpy(v, sptr, 4);
1681 for (j = 0; j < png_pass_inc[pass]; j++)
1682 {
1683 dp -= 4;
1684 png_memcpy(dp, v, 4);
1685 }
1686 }
1687 }
1688 else if (((pass == 2) || (pass == 3)) && width)
1689 {
1690 int width_mmx = ((width >> 1) << 1) ;
1691 width -= width_mmx;
1692 if (width_mmx)
1693 {
1694 _asm
1695 {
1696 mov esi, sptr
1697 mov edi, dp
1698 mov ecx, width_mmx
1699 sub esi, 4
1700 sub edi, 28
1701loop4_pass2:
1702 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1703 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1704 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1705 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1706 movq [edi], mm0
1707 movq [edi + 8], mm0
1708 movq [edi+16], mm1
1709 movq [edi + 24], mm1
1710 sub esi, 8
1711 sub edi, 32
1712 sub ecx, 2
1713 jnz loop4_pass2
1714 EMMS
1715 }
1716 }
1717
1718 sptr -= (width_mmx*4 - 4); // sign fixed
1719 dp -= (width_mmx*16 - 4); // sign fixed
1720 for (i = width; i; i--)
1721 {
1722 png_byte v[8];
1723 int j;
1724 sptr -= 4;
1725 png_memcpy(v, sptr, 4);
1726 for (j = 0; j < png_pass_inc[pass]; j++)
1727 {
1728 dp -= 4;
1729 png_memcpy(dp, v, 4);
1730 }
1731 }
1732 }
1733 else if (width) // pass == 4 or 5
1734 {
1735 int width_mmx = ((width >> 1) << 1) ;
1736 width -= width_mmx;
1737 if (width_mmx)
1738 {
1739 _asm
1740 {
1741 mov esi, sptr
1742 mov edi, dp
1743 mov ecx, width_mmx
1744 sub esi, 4
1745 sub edi, 12
1746loop4_pass4:
1747 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1748 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1749 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1750 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1751 movq [edi], mm0
1752 sub esi, 8
1753 movq [edi + 8], mm1
1754 sub edi, 16
1755 sub ecx, 2
1756 jnz loop4_pass4
1757 EMMS
1758 }
1759 }
1760
1761 sptr -= (width_mmx*4 - 4); // sign fixed
1762 dp -= (width_mmx*8 - 4); // sign fixed
1763 for (i = width; i; i--)
1764 {
1765 png_byte v[8];
1766 int j;
1767 sptr -= 4;
1768 png_memcpy(v, sptr, 4);
1769 for (j = 0; j < png_pass_inc[pass]; j++)
1770 {
1771 dp -= 4;
1772 png_memcpy(dp, v, 4);
1773 }
1774 }
1775 }
1776
1777 } /* end of pixel_bytes == 4 */
1778
1779 else if (pixel_bytes == 6)
1780 {
1781 for (i = width; i; i--)
1782 {
1783 png_byte v[8];
1784 int j;
1785 png_memcpy(v, sptr, 6);
1786 for (j = 0; j < png_pass_inc[pass]; j++)
1787 {
1788 png_memcpy(dp, v, 6);
1789 dp -= 6;
1790 }
1791 sptr -= 6;
1792 }
1793 } /* end of pixel_bytes == 6 */
1794
1795 else
1796 {
1797 for (i = width; i; i--)
1798 {
1799 png_byte v[8];
1800 int j;
1801 png_memcpy(v, sptr, pixel_bytes);
1802 for (j = 0; j < png_pass_inc[pass]; j++)
1803 {
1804 png_memcpy(dp, v, pixel_bytes);
1805 dp -= pixel_bytes;
1806 }
1807 sptr-= pixel_bytes;
1808 }
1809 }
1810 } /* end of mmx_supported */
1811
1812 else /* MMX not supported: use modified C code - takes advantage
1813 * of inlining of memcpy for a constant */
1814 {
1815 if (pixel_bytes == 1)
1816 {
1817 for (i = width; i; i--)
1818 {
1819 int j;
1820 for (j = 0; j < png_pass_inc[pass]; j++)
1821 *dp-- = *sptr;
1822 sptr--;
1823 }
1824 }
1825 else if (pixel_bytes == 3)
1826 {
1827 for (i = width; i; i--)
1828 {
1829 png_byte v[8];
1830 int j;
1831 png_memcpy(v, sptr, pixel_bytes);
1832 for (j = 0; j < png_pass_inc[pass]; j++)
1833 {
1834 png_memcpy(dp, v, pixel_bytes);
1835 dp -= pixel_bytes;
1836 }
1837 sptr -= pixel_bytes;
1838 }
1839 }
1840 else if (pixel_bytes == 2)
1841 {
1842 for (i = width; i; i--)
1843 {
1844 png_byte v[8];
1845 int j;
1846 png_memcpy(v, sptr, pixel_bytes);
1847 for (j = 0; j < png_pass_inc[pass]; j++)
1848 {
1849 png_memcpy(dp, v, pixel_bytes);
1850 dp -= pixel_bytes;
1851 }
1852 sptr -= pixel_bytes;
1853 }
1854 }
1855 else if (pixel_bytes == 4)
1856 {
1857 for (i = width; i; i--)
1858 {
1859 png_byte v[8];
1860 int j;
1861 png_memcpy(v, sptr, pixel_bytes);
1862 for (j = 0; j < png_pass_inc[pass]; j++)
1863 {
1864 png_memcpy(dp, v, pixel_bytes);
1865 dp -= pixel_bytes;
1866 }
1867 sptr -= pixel_bytes;
1868 }
1869 }
1870 else if (pixel_bytes == 6)
1871 {
1872 for (i = width; i; i--)
1873 {
1874 png_byte v[8];
1875 int j;
1876 png_memcpy(v, sptr, pixel_bytes);
1877 for (j = 0; j < png_pass_inc[pass]; j++)
1878 {
1879 png_memcpy(dp, v, pixel_bytes);
1880 dp -= pixel_bytes;
1881 }
1882 sptr -= pixel_bytes;
1883 }
1884 }
1885 else
1886 {
1887 for (i = width; i; i--)
1888 {
1889 png_byte v[8];
1890 int j;
1891 png_memcpy(v, sptr, pixel_bytes);
1892 for (j = 0; j < png_pass_inc[pass]; j++)
1893 {
1894 png_memcpy(dp, v, pixel_bytes);
1895 dp -= pixel_bytes;
1896 }
1897 sptr -= pixel_bytes;
1898 }
1899 }
1900
1901 } /* end of MMX not supported */
1902 break;
1903 }
1904 } /* end switch (row_info->pixel_depth) */
1905
1906 row_info->width = final_width;
1907
1908 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1909 }
1910
1911}
1912
1913#endif /* PNG_READ_INTERLACING_SUPPORTED */
1914
1915
1916// These variables are utilized in the functions below. They are declared
1917// globally here to ensure alignment on 8-byte boundaries.
1918
1919union uAll {
1920 __int64 use;
1921 double align;
1922} LBCarryMask = {0x0101010101010101},
1923 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1924 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1925
1926
1927// Optimized code for PNG Average filter decoder
1928void /* PRIVATE */
1929png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1930 , png_bytep prev_row)
1931{
1932 int bpp;
1933 png_uint_32 FullLength;
1934 png_uint_32 MMXLength;
1935 //png_uint_32 len;
1936 int diff;
1937
1938 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1939 FullLength = row_info->rowbytes; // # of bytes to filter
1940 _asm {
1941 // Init address pointers and offset
1942 mov edi, row // edi ==> Avg(x)
1943 xor ebx, ebx // ebx ==> x
1944 mov edx, edi
1945 mov esi, prev_row // esi ==> Prior(x)
1946 sub edx, bpp // edx ==> Raw(x-bpp)
1947
1948 xor eax, eax
1949 // Compute the Raw value for the first bpp bytes
1950 // Raw(x) = Avg(x) + (Prior(x)/2)
1951davgrlp:
1952 mov al, [esi + ebx] // Load al with Prior(x)
1953 inc ebx
1954 shr al, 1 // divide by 2
1955 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1956 cmp ebx, bpp
1957 mov [edi+ebx-1], al // Write back Raw(x);
1958 // mov does not affect flags; -1 to offset inc ebx
1959 jb davgrlp
1960 // get # of bytes to alignment
1961 mov diff, edi // take start of row
1962 add diff, ebx // add bpp
1963 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1964 and diff, 0xfffffff8 // mask to alignment boundary
1965 sub diff, edi // subtract from start ==> value ebx at alignment
1966 jz davggo
1967 // fix alignment
1968 // Compute the Raw value for the bytes upto the alignment boundary
1969 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1970 xor ecx, ecx
1971davglp1:
1972 xor eax, eax
1973 mov cl, [esi + ebx] // load cl with Prior(x)
1974 mov al, [edx + ebx] // load al with Raw(x-bpp)
1975 add ax, cx
1976 inc ebx
1977 shr ax, 1 // divide by 2
1978 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1979 cmp ebx, diff // Check if at alignment boundary
1980 mov [edi+ebx-1], al // Write back Raw(x);
1981 // mov does not affect flags; -1 to offset inc ebx
1982 jb davglp1 // Repeat until at alignment boundary
1983davggo:
1984 mov eax, FullLength
1985 mov ecx, eax
1986 sub eax, ebx // subtract alignment fix
1987 and eax, 0x00000007 // calc bytes over mult of 8
1988 sub ecx, eax // drop over bytes from original length
1989 mov MMXLength, ecx
1990 } // end _asm block
1991 // Now do the math for the rest of the row
1992 switch ( bpp )
1993 {
1994 case 3:
1995 {
1996 ActiveMask.use = 0x0000000000ffffff;
1997 ShiftBpp.use = 24; // == 3 * 8
1998 ShiftRem.use = 40; // == 64 - 24
1999 _asm {
2000 // Re-init address pointers and offset
2001 movq mm7, ActiveMask
2002 mov ebx, diff // ebx ==> x = offset to alignment boundary
2003 movq mm5, LBCarryMask
2004 mov edi, row // edi ==> Avg(x)
2005 movq mm4, HBClearMask
2006 mov esi, prev_row // esi ==> Prior(x)
2007 // PRIME the pump (load the first Raw(x-bpp) data set
2008 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2009 // (we correct position in loop below)
2010davg3lp:
2011 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
2012 // Add (Prev_row/2) to Average
2013 movq mm3, mm5
2014 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
2015 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
2016 movq mm6, mm7
2017 pand mm3, mm1 // get lsb for each prev_row byte
2018 psrlq mm1, 1 // divide prev_row bytes by 2
2019 pand mm1, mm4 // clear invalid bit 7 of each byte
2020 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2021 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2022 movq mm1, mm3 // now use mm1 for getting LBCarrys
2023 pand mm1, mm2 // get LBCarrys for each byte where both
2024 // lsb's were == 1 (Only valid for active group)
2025 psrlq mm2, 1 // divide raw bytes by 2
2026 pand mm2, mm4 // clear invalid bit 7 of each byte
2027 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2028 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2029 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2030 // byte
2031 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2032 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2033 movq mm2, mm0 // mov updated Raws to mm2
2034 psllq mm2, ShiftBpp // shift data to position correctly
2035 movq mm1, mm3 // now use mm1 for getting LBCarrys
2036 pand mm1, mm2 // get LBCarrys for each byte where both
2037 // lsb's were == 1 (Only valid for active group)
2038 psrlq mm2, 1 // divide raw bytes by 2
2039 pand mm2, mm4 // clear invalid bit 7 of each byte
2040 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2041 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2042 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2043 // byte
2044
2045 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2046 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2047 // bytes
2048 movq mm2, mm0 // mov updated Raws to mm2
2049 psllq mm2, ShiftBpp // shift data to position correctly
2050 // Data only needs to be shifted once here to
2051 // get the correct x-bpp offset.
2052 movq mm1, mm3 // now use mm1 for getting LBCarrys
2053 pand mm1, mm2 // get LBCarrys for each byte where both
2054 // lsb's were == 1 (Only valid for active group)
2055 psrlq mm2, 1 // divide raw bytes by 2
2056 pand mm2, mm4 // clear invalid bit 7 of each byte
2057 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2058 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2059 add ebx, 8
2060 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2061 // byte
2062
2063 // Now ready to write back to memory
2064 movq [edi + ebx - 8], mm0
2065 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2066 cmp ebx, MMXLength
2067 movq mm2, mm0 // mov updated Raw(x) to mm2
2068 jb davg3lp
2069 } // end _asm block
2070 }
2071 break;
2072
2073 case 6:
2074 case 4:
2075 case 7:
2076 case 5:
2077 {
2078 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2079 // appropriate inactive bytes
2080 ShiftBpp.use = bpp << 3;
2081 ShiftRem.use = 64 - ShiftBpp.use;
2082 _asm {
2083 movq mm4, HBClearMask
2084 // Re-init address pointers and offset
2085 mov ebx, diff // ebx ==> x = offset to alignment boundary
2086 // Load ActiveMask and clear all bytes except for 1st active group
2087 movq mm7, ActiveMask
2088 mov edi, row // edi ==> Avg(x)
2089 psrlq mm7, ShiftRem
2090 mov esi, prev_row // esi ==> Prior(x)
2091 movq mm6, mm7
2092 movq mm5, LBCarryMask
2093 psllq mm6, ShiftBpp // Create mask for 2nd active group
2094 // PRIME the pump (load the first Raw(x-bpp) data set
2095 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2096 // (we correct position in loop below)
2097davg4lp:
2098 movq mm0, [edi + ebx]
2099 psrlq mm2, ShiftRem // shift data to position correctly
2100 movq mm1, [esi + ebx]
2101 // Add (Prev_row/2) to Average
2102 movq mm3, mm5
2103 pand mm3, mm1 // get lsb for each prev_row byte
2104 psrlq mm1, 1 // divide prev_row bytes by 2
2105 pand mm1, mm4 // clear invalid bit 7 of each byte
2106 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2107 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2108 movq mm1, mm3 // now use mm1 for getting LBCarrys
2109 pand mm1, mm2 // get LBCarrys for each byte where both
2110 // lsb's were == 1 (Only valid for active group)
2111 psrlq mm2, 1 // divide raw bytes by 2
2112 pand mm2, mm4 // clear invalid bit 7 of each byte
2113 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2114 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2115 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2116 // byte
2117 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2118 movq mm2, mm0 // mov updated Raws to mm2
2119 psllq mm2, ShiftBpp // shift data to position correctly
2120 add ebx, 8
2121 movq mm1, mm3 // now use mm1 for getting LBCarrys
2122 pand mm1, mm2 // get LBCarrys for each byte where both
2123 // lsb's were == 1 (Only valid for active group)
2124 psrlq mm2, 1 // divide raw bytes by 2
2125 pand mm2, mm4 // clear invalid bit 7 of each byte
2126 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2127 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2128 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2129 // byte
2130 cmp ebx, MMXLength
2131 // Now ready to write back to memory
2132 movq [edi + ebx - 8], mm0
2133 // Prep Raw(x-bpp) for next loop
2134 movq mm2, mm0 // mov updated Raws to mm2
2135 jb davg4lp
2136 } // end _asm block
2137 }
2138 break;
2139 case 2:
2140 {
2141 ActiveMask.use = 0x000000000000ffff;
2142 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2143 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2144 _asm {
2145 // Load ActiveMask
2146 movq mm7, ActiveMask
2147 // Re-init address pointers and offset
2148 mov ebx, diff // ebx ==> x = offset to alignment boundary
2149 movq mm5, LBCarryMask
2150 mov edi, row // edi ==> Avg(x)
2151 movq mm4, HBClearMask
2152 mov esi, prev_row // esi ==> Prior(x)
2153 // PRIME the pump (load the first Raw(x-bpp) data set
2154 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2155 // (we correct position in loop below)
2156davg2lp:
2157 movq mm0, [edi + ebx]
2158 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2159 movq mm1, [esi + ebx]
2160 // Add (Prev_row/2) to Average
2161 movq mm3, mm5
2162 pand mm3, mm1 // get lsb for each prev_row byte
2163 psrlq mm1, 1 // divide prev_row bytes by 2
2164 pand mm1, mm4 // clear invalid bit 7 of each byte
2165 movq mm6, mm7
2166 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2167 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2168 movq mm1, mm3 // now use mm1 for getting LBCarrys
2169 pand mm1, mm2 // get LBCarrys for each byte where both
2170 // lsb's were == 1 (Only valid for active group)
2171 psrlq mm2, 1 // divide raw bytes by 2
2172 pand mm2, mm4 // clear invalid bit 7 of each byte
2173 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2174 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2175 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2176 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2177 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2178 movq mm2, mm0 // mov updated Raws to mm2
2179 psllq mm2, ShiftBpp // shift data to position correctly
2180 movq mm1, mm3 // now use mm1 for getting LBCarrys
2181 pand mm1, mm2 // get LBCarrys for each byte where both
2182 // lsb's were == 1 (Only valid for active group)
2183 psrlq mm2, 1 // divide raw bytes by 2
2184 pand mm2, mm4 // clear invalid bit 7 of each byte
2185 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2186 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2187 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2188
2189 // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
2190 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2191 movq mm2, mm0 // mov updated Raws to mm2
2192 psllq mm2, ShiftBpp // shift data to position correctly
2193 // Data only needs to be shifted once here to
2194 // get the correct x-bpp offset.
2195 movq mm1, mm3 // now use mm1 for getting LBCarrys
2196 pand mm1, mm2 // get LBCarrys for each byte where both
2197 // lsb's were == 1 (Only valid for active group)
2198 psrlq mm2, 1 // divide raw bytes by 2
2199 pand mm2, mm4 // clear invalid bit 7 of each byte
2200 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2201 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2202 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2203
2204 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2205 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2206 movq mm2, mm0 // mov updated Raws to mm2
2207 psllq mm2, ShiftBpp // shift data to position correctly
2208 // Data only needs to be shifted once here to
2209 // get the correct x-bpp offset.
2210 add ebx, 8
2211 movq mm1, mm3 // now use mm1 for getting LBCarrys
2212 pand mm1, mm2 // get LBCarrys for each byte where both
2213 // lsb's were == 1 (Only valid for active group)
2214 psrlq mm2, 1 // divide raw bytes by 2
2215 pand mm2, mm4 // clear invalid bit 7 of each byte
2216 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2217 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2218 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2219
2220 cmp ebx, MMXLength
2221 // Now ready to write back to memory
2222 movq [edi + ebx - 8], mm0
2223 // Prep Raw(x-bpp) for next loop
2224 movq mm2, mm0 // mov updated Raws to mm2
2225 jb davg2lp
2226 } // end _asm block
2227 }
2228 break;
2229
2230 case 1: // bpp == 1
2231 {
2232 _asm {
2233 // Re-init address pointers and offset
2234 mov ebx, diff // ebx ==> x = offset to alignment boundary
2235 mov edi, row // edi ==> Avg(x)
2236 cmp ebx, FullLength // Test if offset at end of array
2237 jnb davg1end
2238 // Do Paeth decode for remaining bytes
2239 mov esi, prev_row // esi ==> Prior(x)
2240 mov edx, edi
2241 xor ecx, ecx // zero ecx before using cl & cx in loop below
2242 sub edx, bpp // edx ==> Raw(x-bpp)
2243davg1lp:
2244 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2245 xor eax, eax
2246 mov cl, [esi + ebx] // load cl with Prior(x)
2247 mov al, [edx + ebx] // load al with Raw(x-bpp)
2248 add ax, cx
2249 inc ebx
2250 shr ax, 1 // divide by 2
2251 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2252 cmp ebx, FullLength // Check if at end of array
2253 mov [edi+ebx-1], al // Write back Raw(x);
2254 // mov does not affect flags; -1 to offset inc ebx
2255 jb davg1lp
2256davg1end:
2257 } // end _asm block
2258 }
2259 return;
2260
2261 case 8: // bpp == 8
2262 {
2263 _asm {
2264 // Re-init address pointers and offset
2265 mov ebx, diff // ebx ==> x = offset to alignment boundary
2266 movq mm5, LBCarryMask
2267 mov edi, row // edi ==> Avg(x)
2268 movq mm4, HBClearMask
2269 mov esi, prev_row // esi ==> Prior(x)
2270 // PRIME the pump (load the first Raw(x-bpp) data set)
2271 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2272 // (NO NEED to correct position in loop below)
2273davg8lp:
2274 movq mm0, [edi + ebx]
2275 movq mm3, mm5
2276 movq mm1, [esi + ebx]
2277 add ebx, 8
2278 pand mm3, mm1 // get lsb for each prev_row byte
2279 psrlq mm1, 1 // divide prev_row bytes by 2
2280 pand mm3, mm2 // get LBCarrys for each byte where both
2281 // lsb's were == 1
2282 psrlq mm2, 1 // divide raw bytes by 2
2283 pand mm1, mm4 // clear invalid bit 7 of each byte
2284 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2285 pand mm2, mm4 // clear invalid bit 7 of each byte
2286 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2287 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2288 cmp ebx, MMXLength
2289 movq [edi + ebx - 8], mm0
2290 movq mm2, mm0 // reuse as Raw(x-bpp)
2291 jb davg8lp
2292 } // end _asm block
2293 }
2294 break;
2295 default: // bpp greater than 8
2296 {
2297 _asm {
2298 movq mm5, LBCarryMask
2299 // Re-init address pointers and offset
2300 mov ebx, diff // ebx ==> x = offset to alignment boundary
2301 mov edi, row // edi ==> Avg(x)
2302 movq mm4, HBClearMask
2303 mov edx, edi
2304 mov esi, prev_row // esi ==> Prior(x)
2305 sub edx, bpp // edx ==> Raw(x-bpp)
2306davgAlp:
2307 movq mm0, [edi + ebx]
2308 movq mm3, mm5
2309 movq mm1, [esi + ebx]
2310 pand mm3, mm1 // get lsb for each prev_row byte
2311 movq mm2, [edx + ebx]
2312 psrlq mm1, 1 // divide prev_row bytes by 2
2313 pand mm3, mm2 // get LBCarrys for each byte where both
2314 // lsb's were == 1
2315 psrlq mm2, 1 // divide raw bytes by 2
2316 pand mm1, mm4 // clear invalid bit 7 of each byte
2317 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2318 pand mm2, mm4 // clear invalid bit 7 of each byte
2319 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2320 add ebx, 8
2321 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2322 cmp ebx, MMXLength
2323 movq [edi + ebx - 8], mm0
2324 jb davgAlp
2325 } // end _asm block
2326 }
2327 break;
2328 } // end switch ( bpp )
2329
2330 _asm {
2331 // MMX acceleration complete; now do clean-up
2332 // Check if any remaining bytes left to decode
2333 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2334 mov edi, row // edi ==> Avg(x)
2335 cmp ebx, FullLength // Test if offset at end of array
2336 jnb davgend
2337 // Do Avg decode for remaining bytes
2338 mov esi, prev_row // esi ==> Prior(x)
2339 mov edx, edi
2340 xor ecx, ecx // zero ecx before using cl & cx in loop below
2341 sub edx, bpp // edx ==> Raw(x-bpp)
2342davglp2:
2343 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2344 xor eax, eax
2345 mov cl, [esi + ebx] // load cl with Prior(x)
2346 mov al, [edx + ebx] // load al with Raw(x-bpp)
2347 add ax, cx
2348 inc ebx
2349 shr ax, 1 // divide by 2
2350 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2351 cmp ebx, FullLength // Check if at end of array
2352 mov [edi+ebx-1], al // Write back Raw(x);
2353 // mov does not affect flags; -1 to offset inc ebx
2354 jb davglp2
2355davgend:
2356 emms // End MMX instructions; prep for possible FP instrs.
2357 } // end _asm block
2358}
2359
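// The Avg decoder above follows the three-stage pattern used by every
// routine in this file: a byte loop up to the 8-byte alignment
// boundary, an MMX loop over MMXLength bytes, and a byte loop for the
// tail. A scalar sketch of the recurrence it implements for x >= bpp
// (the first bpp bytes add Prior(x)/2 only; FullLength is rowbytes):
//
//    for (x = bpp; x < FullLength; x++)
//       row[x] = (png_byte)(row[x] +
//          ((row[x - bpp] + prev_row[x]) >> 1));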
2360// Optimized code for PNG Paeth filter decoder
2361void /* PRIVATE */
2362png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2363 png_bytep prev_row)
2364{
2365 png_uint_32 FullLength;
2366 png_uint_32 MMXLength;
2367 //png_uint_32 len;
2368 int bpp;
2369 int diff;
2370 //int ptemp;
2371 int patemp, pbtemp, pctemp;
2372
2373 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2374 FullLength = row_info->rowbytes; // # of bytes to filter
2375 _asm
2376 {
2377 xor ebx, ebx // ebx ==> x offset
2378 mov edi, row
2379 xor edx, edx // edx ==> x-bpp offset
2380 mov esi, prev_row
2381 xor eax, eax
2382
2383 // Compute the Raw value for the first bpp bytes
2384 // Note: a = Raw(x-bpp) and c = Prior(x-bpp) are zero for x < bpp,
2385 // so the predictor reduces to b and Paeth(x) = Raw(x) + Prior(x) there
2386dpthrlp:
2387 mov al, [edi + ebx]
2388 add al, [esi + ebx]
2389 inc ebx
2390 cmp ebx, bpp
2391 mov [edi + ebx - 1], al
2392 jb dpthrlp
2393 // get # of bytes to alignment
2394 mov diff, edi // take start of row
2395 add diff, ebx // add bpp
2396 xor ecx, ecx
2397 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2398 and diff, 0xfffffff8 // mask to alignment boundary
2399 sub diff, edi // subtract from start ==> value ebx at alignment
2400 jz dpthgo
2401 // fix alignment
2402dpthlp1:
2403 xor eax, eax
2404 // pav = p - a = (a + b - c) - a = b - c
2405 mov al, [esi + ebx] // load Prior(x) into al
2406 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2407 sub eax, ecx // subtract Prior(x-bpp)
2408 mov patemp, eax // Save pav for later use
2409 xor eax, eax
2410 // pbv = p - b = (a + b - c) - b = a - c
2411 mov al, [edi + edx] // load Raw(x-bpp) into al
2412 sub eax, ecx // subtract Prior(x-bpp)
2413 mov ecx, eax
2414 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2415 add eax, patemp // pcv = pav + pbv
2416 // pc = abs(pcv)
2417 test eax, 0x80000000
2418 jz dpthpca
2419 neg eax // reverse sign of neg values
2420dpthpca:
2421 mov pctemp, eax // save pc for later use
2422 // pb = abs(pbv)
2423 test ecx, 0x80000000
2424 jz dpthpba
2425 neg ecx // reverse sign of neg values
2426dpthpba:
2427 mov pbtemp, ecx // save pb for later use
2428 // pa = abs(pav)
2429 mov eax, patemp
2430 test eax, 0x80000000
2431 jz dpthpaa
2432 neg eax // reverse sign of neg values
2433dpthpaa:
2434 mov patemp, eax // save pa for later use
2435 // test if pa <= pb
2436 cmp eax, ecx
2437 jna dpthabb
2438 // pa > pb; now test if pb <= pc
2439 cmp ecx, pctemp
2440 jna dpthbbc
2441 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2442 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2443 jmp dpthpaeth
2444dpthbbc:
2445 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2446 mov cl, [esi + ebx] // load Prior(x) into cl
2447 jmp dpthpaeth
2448dpthabb:
2449 // pa <= pb; now test if pa <= pc
2450 cmp eax, pctemp
2451 jna dpthabc
2452 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2453 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2454 jmp dpthpaeth
2455dpthabc:
2456 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2457 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2458dpthpaeth:
2459 inc ebx
2460 inc edx
2461 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2462 add [edi + ebx - 1], cl
2463 cmp ebx, diff
2464 jb dpthlp1
2465dpthgo:
2466 mov ecx, FullLength
2467 mov eax, ecx
2468 sub eax, ebx // subtract alignment fix
2469 and eax, 0x00000007 // calc bytes over mult of 8
2470 sub ecx, eax // drop over bytes from original length
2471 mov MMXLength, ecx
2472 } // end _asm block
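// In C terms the block above computes (a sketch using this function's
// locals; png_size_t stands in for an integer wide enough to hold a
// pointer):
//
//    diff = (int)(((((png_size_t)row + bpp + 15) & ~7)
//       - (png_size_t)row);   // 8-byte boundary >= 8 bytes past row+bpp
//    MMXLength = FullLength - ((FullLength - diff) & 7);
//
// so the byte loop above runs up to the boundary, the MMX loop stops at
// the last whole quadword, and the cleanup loop at the bottom of the
// function finishes the row.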
2473 // Now do the math for the rest of the row
2474 switch ( bpp )
2475 {
2476 case 3:
2477 {
2478 ActiveMask.use = 0x0000000000ffffff;
2479 ActiveMaskEnd.use = 0xffff000000000000;
2480 ShiftBpp.use = 24; // == bpp(3) * 8
2481 ShiftRem.use = 40; // == 64 - 24
2482 _asm
2483 {
2484 mov ebx, diff
2485 mov edi, row
2486 mov esi, prev_row
2487 pxor mm0, mm0
2488 // PRIME the pump (load the first Raw(x-bpp) data set)
2489 movq mm1, [edi+ebx-8]
2490dpth3lp:
2491 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2492 movq mm2, [esi + ebx] // load b=Prior(x)
2493 punpcklbw mm1, mm0 // Unpack Low bytes of a
2494 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2495 punpcklbw mm2, mm0 // Unpack Low bytes of b
2496 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2497 // pav = p - a = (a + b - c) - a = b - c
2498 movq mm4, mm2
2499 punpcklbw mm3, mm0 // Unpack Low bytes of c
2500 // pbv = p - b = (a + b - c) - b = a - c
2501 movq mm5, mm1
2502 psubw mm4, mm3
2503 pxor mm7, mm7
2504 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2505 movq mm6, mm4
2506 psubw mm5, mm3
2507
2508 // pa = abs(p-a) = abs(pav)
2509 // pb = abs(p-b) = abs(pbv)
2510 // pc = abs(p-c) = abs(pcv)
2511 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2512 paddw mm6, mm5
2513 pand mm0, mm4 // Only pav bytes < 0 in mm0
2514 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2515 psubw mm4, mm0
2516 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2517 psubw mm4, mm0
2518 psubw mm5, mm7
2519 pxor mm0, mm0
2520 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2521 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2522 psubw mm5, mm7
2523 psubw mm6, mm0
2524 // test pa <= pb
2525 movq mm7, mm4
2526 psubw mm6, mm0
2527 pcmpgtw mm7, mm5 // pa > pb?
2528 movq mm0, mm7
2529 // use mm7 mask to merge pa & pb
2530 pand mm5, mm7
2531 // use mm0 mask copy to merge a & b
2532 pand mm2, mm0
2533 pandn mm7, mm4
2534 pandn mm0, mm1
2535 paddw mm7, mm5
2536 paddw mm0, mm2
2537 // test ((pa <= pb)? pa:pb) <= pc
2538 pcmpgtw mm7, mm6 // pab > pc?
2539 pxor mm1, mm1
2540 pand mm3, mm7
2541 pandn mm7, mm0
2542 paddw mm7, mm3
2543 pxor mm0, mm0
2544 packuswb mm7, mm1
2545 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2546 pand mm7, ActiveMask
2547 movq mm2, mm3 // load b=Prior(x) step 1
2548 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2549 punpcklbw mm3, mm0 // Unpack Low bytes of c
2550 movq [edi + ebx], mm7 // write back updated value
2551 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2552 // Now do Paeth for 2nd set of bytes (3-5)
2553 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2554 punpcklbw mm1, mm0 // Unpack Low bytes of a
2555 pxor mm7, mm7
2556 punpcklbw mm2, mm0 // Unpack Low bytes of b
2557 // pbv = p - b = (a + b - c) - b = a - c
2558 movq mm5, mm1
2559 // pav = p - a = (a + b - c) - a = b - c
2560 movq mm4, mm2
2561 psubw mm5, mm3
2562 psubw mm4, mm3
2563 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2564 // pav + pbv = pbv + pav
2565 movq mm6, mm5
2566 paddw mm6, mm4
2567
2568 // pa = abs(p-a) = abs(pav)
2569 // pb = abs(p-b) = abs(pbv)
2570 // pc = abs(p-c) = abs(pcv)
2571 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2572 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2573 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2574 pand mm7, mm4 // Only pav bytes < 0 in mm7
2575 psubw mm5, mm0
2576 psubw mm4, mm7
2577 psubw mm5, mm0
2578 psubw mm4, mm7
2579 pxor mm0, mm0
2580 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2581 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2582 psubw mm6, mm0
2583 // test pa <= pb
2584 movq mm7, mm4
2585 psubw mm6, mm0
2586 pcmpgtw mm7, mm5 // pa > pb?
2587 movq mm0, mm7
2588 // use mm7 mask to merge pa & pb
2589 pand mm5, mm7
2590 // use mm0 mask copy to merge a & b
2591 pand mm2, mm0
2592 pandn mm7, mm4
2593 pandn mm0, mm1
2594 paddw mm7, mm5
2595 paddw mm0, mm2
2596 // test ((pa <= pb)? pa:pb) <= pc
2597 pcmpgtw mm7, mm6 // pab > pc?
2598 movq mm2, [esi + ebx] // load b=Prior(x)
2599 pand mm3, mm7
2600 pandn mm7, mm0
2601 pxor mm1, mm1
2602 paddw mm7, mm3
2603 pxor mm0, mm0
2604 packuswb mm7, mm1
2605 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2606 pand mm7, ActiveMask
2607 punpckhbw mm2, mm0 // Unpack High bytes of b
2608 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2609 // pav = p - a = (a + b - c) - a = b - c
2610 movq mm4, mm2
2611 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2612 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2613 movq [edi + ebx], mm7 // write back updated value
2614 movq mm1, mm7
2615 punpckhbw mm3, mm0 // Unpack High bytes of c
2616 psllq mm1, ShiftBpp // Shift bytes
2617 // Now mm1 will be used as Raw(x-bpp)
2618 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2619 pxor mm7, mm7
2620 punpckhbw mm1, mm0 // Unpack High bytes of a
2621 psubw mm4, mm3
2622 // pbv = p - b = (a + b - c) - b = a - c
2623 movq mm5, mm1
2624 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2625 movq mm6, mm4
2626 psubw mm5, mm3
2627 pxor mm0, mm0
2628 paddw mm6, mm5
2629
2630 // pa = abs(p-a) = abs(pav)
2631 // pb = abs(p-b) = abs(pbv)
2632 // pc = abs(p-c) = abs(pcv)
2633 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2634 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2635 pand mm0, mm4 // Only pav bytes < 0 in mm0
2636 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2637 psubw mm4, mm0
2638 psubw mm5, mm7
2639 psubw mm4, mm0
2640 psubw mm5, mm7
2641 pxor mm0, mm0
2642 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2643 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2644 psubw mm6, mm0
2645 // test pa <= pb
2646 movq mm7, mm4
2647 psubw mm6, mm0
2648 pcmpgtw mm7, mm5 // pa > pb?
2649 movq mm0, mm7
2650 // use mm0 mask copy to merge a & b
2651 pand mm2, mm0
2652 // use mm7 mask to merge pa & pb
2653 pand mm5, mm7
2654 pandn mm0, mm1
2655 pandn mm7, mm4
2656 paddw mm0, mm2
2657 paddw mm7, mm5
2658 // test ((pa <= pb)? pa:pb) <= pc
2659 pcmpgtw mm7, mm6 // pab > pc?
2660 pand mm3, mm7
2661 pandn mm7, mm0
2662 paddw mm7, mm3
2663 pxor mm1, mm1
2664 packuswb mm1, mm7
2665 // Step ebx to next set of 8 bytes and repeat loop until done
2666 add ebx, 8
2667 pand mm1, ActiveMaskEnd
2668 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2669
2670 cmp ebx, MMXLength
2671 pxor mm0, mm0 // pxor does not affect flags
2672 movq [edi + ebx - 8], mm1 // write back updated value
2673 // mm1 will be used as Raw(x-bpp) next loop
2674 // mm3 ready to be used as Prior(x-bpp) next loop
2675 jb dpth3lp
2676 } // end _asm block
2677 }
2678 break;
2679
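// All of the absolute values above are computed branchlessly, four
// 16-bit lanes at a time. A per-lane sketch of the identity (assumes
// two's complement; the mask comes from pcmpgtw against zero):
//
//    mask = (v < 0) ? 0xffff : 0;   // pcmpgtw
//    t    = v & mask;               // pand: v where negative, else 0
//    v    = v - t - t;              // two psubw's: negative lanes get -v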
2680 case 6:
2681 case 7:
2682 case 5:
2683 {
2684 ActiveMask.use = 0x00000000ffffffff;
2685 ActiveMask2.use = 0xffffffff00000000;
2686 ShiftBpp.use = bpp << 3; // == bpp * 8
2687 ShiftRem.use = 64 - ShiftBpp.use;
2688 _asm
2689 {
2690 mov ebx, diff
2691 mov edi, row
2692 mov esi, prev_row
2693 // PRIME the pump (load the first Raw(x-bpp) data set)
2694 movq mm1, [edi+ebx-8]
2695 pxor mm0, mm0
2696dpth6lp:
2697 // Must shift to position Raw(x-bpp) data
2698 psrlq mm1, ShiftRem
2699 // Do first set of 4 bytes
2700 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2701 punpcklbw mm1, mm0 // Unpack Low bytes of a
2702 movq mm2, [esi + ebx] // load b=Prior(x)
2703 punpcklbw mm2, mm0 // Unpack Low bytes of b
2704 // Must shift to position Prior(x-bpp) data
2705 psrlq mm3, ShiftRem
2706 // pav = p - a = (a + b - c) - a = b - c
2707 movq mm4, mm2
2708 punpcklbw mm3, mm0 // Unpack Low bytes of c
2709 // pbv = p - b = (a + b - c) - b = a - c
2710 movq mm5, mm1
2711 psubw mm4, mm3
2712 pxor mm7, mm7
2713 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2714 movq mm6, mm4
2715 psubw mm5, mm3
2716 // pa = abs(p-a) = abs(pav)
2717 // pb = abs(p-b) = abs(pbv)
2718 // pc = abs(p-c) = abs(pcv)
2719 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2720 paddw mm6, mm5
2721 pand mm0, mm4 // Only pav bytes < 0 in mm0
2722 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2723 psubw mm4, mm0
2724 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2725 psubw mm4, mm0
2726 psubw mm5, mm7
2727 pxor mm0, mm0
2728 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2729 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2730 psubw mm5, mm7
2731 psubw mm6, mm0
2732 // test pa <= pb
2733 movq mm7, mm4
2734 psubw mm6, mm0
2735 pcmpgtw mm7, mm5 // pa > pb?
2736 movq mm0, mm7
2737 // use mm7 mask to merge pa & pb
2738 pand mm5, mm7
2739 // use mm0 mask copy to merge a & b
2740 pand mm2, mm0
2741 pandn mm7, mm4
2742 pandn mm0, mm1
2743 paddw mm7, mm5
2744 paddw mm0, mm2
2745 // test ((pa <= pb)? pa:pb) <= pc
2746 pcmpgtw mm7, mm6 // pab > pc?
2747 pxor mm1, mm1
2748 pand mm3, mm7
2749 pandn mm7, mm0
2750 paddw mm7, mm3
2751 pxor mm0, mm0
2752 packuswb mm7, mm1
2753 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2754 pand mm7, ActiveMask
2755 psrlq mm3, ShiftRem
2756 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2757 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2758 movq mm6, mm2
2759 movq [edi + ebx], mm7 // write back updated value
2760 movq mm1, [edi+ebx-8]
2761 psllq mm6, ShiftBpp
2762 movq mm5, mm7
2763 psrlq mm1, ShiftRem
2764 por mm3, mm6
2765 psllq mm5, ShiftBpp
2766 punpckhbw mm3, mm0 // Unpack High bytes of c
2767 por mm1, mm5
2768 // Do second set of 4 bytes
2769 punpckhbw mm2, mm0 // Unpack High bytes of b
2770 punpckhbw mm1, mm0 // Unpack High bytes of a
2771 // pav = p - a = (a + b - c) - a = b - c
2772 movq mm4, mm2
2773 // pbv = p - b = (a + b - c) - b = a - c
2774 movq mm5, mm1
2775 psubw mm4, mm3
2776 pxor mm7, mm7
2777 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2778 movq mm6, mm4
2779 psubw mm5, mm3
2780 // pa = abs(p-a) = abs(pav)
2781 // pb = abs(p-b) = abs(pbv)
2782 // pc = abs(p-c) = abs(pcv)
2783 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2784 paddw mm6, mm5
2785 pand mm0, mm4 // Only pav bytes < 0 in mm0
2786 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2787 psubw mm4, mm0
2788 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2789 psubw mm4, mm0
2790 psubw mm5, mm7
2791 pxor mm0, mm0
2792 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2793 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2794 psubw mm5, mm7
2795 psubw mm6, mm0
2796 // test pa <= pb
2797 movq mm7, mm4
2798 psubw mm6, mm0
2799 pcmpgtw mm7, mm5 // pa > pb?
2800 movq mm0, mm7
2801 // use mm7 mask to merge pa & pb
2802 pand mm5, mm7
2803 // use mm0 mask copy to merge a & b
2804 pand mm2, mm0
2805 pandn mm7, mm4
2806 pandn mm0, mm1
2807 paddw mm7, mm5
2808 paddw mm0, mm2
2809 // test ((pa <= pb)? pa:pb) <= pc
2810 pcmpgtw mm7, mm6 // pab > pc?
2811 pxor mm1, mm1
2812 pand mm3, mm7
2813 pandn mm7, mm0
2814 pxor mm1, mm1
2815 paddw mm7, mm3
2816 pxor mm0, mm0
2817 // Step ebx to next set of 8 bytes and repeat loop until done
2818 add ebx, 8
2819 packuswb mm1, mm7
2820 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2821 cmp ebx, MMXLength
2822 movq [edi + ebx - 8], mm1 // write back updated value
2823 // mm1 will be used as Raw(x-bpp) next loop
2824 jb dpth6lp
2825 } // end _asm block
2826 }
2827 break;
2828
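// The pand/pandn/paddw triples above implement a branchless select:
// with mask all-ones in each lane where pa > pb (from pcmpgtw), each
// lane computes, roughly,
//
//    pab  = (pb & mask) | (pa & ~mask);   // min(pa, pb)
//    pred = (b  & mask) | (a  & ~mask);   // the matching predictor
//
// paddw stands in for the OR, which is safe because the two masked
// terms can never both be nonzero in the same lane.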
2829 case 4:
2830 {
2831 ActiveMask.use = 0x00000000ffffffff;
2832 _asm {
2833 mov ebx, diff
2834 mov edi, row
2835 mov esi, prev_row
2836 pxor mm0, mm0
2837 // PRIME the pump (load the first Raw(x-bpp) data set)
2838 movq mm1, [edi+ebx-8] // the only time we need to read
2839 // a=Raw(x-bpp) from memory
2840dpth4lp:
2841 // Do first set of 4 bytes
2842 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2843 punpckhbw mm1, mm0 // Unpack High bytes of a
2844 movq mm2, [esi + ebx] // load b=Prior(x)
2845 punpcklbw mm2, mm0 // Unpack Low bytes of b
2846 // pav = p - a = (a + b - c) - a = b - c
2847 movq mm4, mm2
2848 punpckhbw mm3, mm0 // Unpack High bytes of c
2849 // pbv = p - b = (a + b - c) - b = a - c
2850 movq mm5, mm1
2851 psubw mm4, mm3
2852 pxor mm7, mm7
2853 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2854 movq mm6, mm4
2855 psubw mm5, mm3
2856 // pa = abs(p-a) = abs(pav)
2857 // pb = abs(p-b) = abs(pbv)
2858 // pc = abs(p-c) = abs(pcv)
2859 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2860 paddw mm6, mm5
2861 pand mm0, mm4 // Only pav bytes < 0 in mm0
2862 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2863 psubw mm4, mm0
2864 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2865 psubw mm4, mm0
2866 psubw mm5, mm7
2867 pxor mm0, mm0
2868 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2869 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2870 psubw mm5, mm7
2871 psubw mm6, mm0
2872 // test pa <= pb
2873 movq mm7, mm4
2874 psubw mm6, mm0
2875 pcmpgtw mm7, mm5 // pa > pb?
2876 movq mm0, mm7
2877 // use mm7 mask to merge pa & pb
2878 pand mm5, mm7
2879 // use mm0 mask copy to merge a & b
2880 pand mm2, mm0
2881 pandn mm7, mm4
2882 pandn mm0, mm1
2883 paddw mm7, mm5
2884 paddw mm0, mm2
2885 // test ((pa <= pb)? pa:pb) <= pc
2886 pcmpgtw mm7, mm6 // pab > pc?
2887 pxor mm1, mm1
2888 pand mm3, mm7
2889 pandn mm7, mm0
2890 paddw mm7, mm3
2891 pxor mm0, mm0
2892 packuswb mm7, mm1
2893 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2894 pand mm7, ActiveMask
2895 movq mm2, mm3 // load b=Prior(x) step 1
2896 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2897 punpcklbw mm3, mm0 // Unpack Low bytes of c
2898 movq [edi + ebx], mm7 // write back updated value
2899 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2900 // Do second set of 4 bytes
2901 punpckhbw mm2, mm0 // Unpack High bytes of b
2902 punpcklbw mm1, mm0 // Unpack Low bytes of a
2903 // pav = p - a = (a + b - c) - a = b - c
2904 movq mm4, mm2
2905 // pbv = p - b = (a + b - c) - b = a - c
2906 movq mm5, mm1
2907 psubw mm4, mm3
2908 pxor mm7, mm7
2909 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2910 movq mm6, mm4
2911 psubw mm5, mm3
2912 // pa = abs(p-a) = abs(pav)
2913 // pb = abs(p-b) = abs(pbv)
2914 // pc = abs(p-c) = abs(pcv)
2915 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2916 paddw mm6, mm5
2917 pand mm0, mm4 // Only pav bytes < 0 in mm0
2918 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2919 psubw mm4, mm0
2920 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2921 psubw mm4, mm0
2922 psubw mm5, mm7
2923 pxor mm0, mm0
2924 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2925 pand mm0, mm6 // Only pcv bytes < 0 in mm0
2926 psubw mm5, mm7
2927 psubw mm6, mm0
2928 // test pa <= pb
2929 movq mm7, mm4
2930 psubw mm6, mm0
2931 pcmpgtw mm7, mm5 // pa > pb?
2932 movq mm0, mm7
2933 // use mm7 mask to merge pa & pb
2934 pand mm5, mm7
2935 // use mm0 mask copy to merge a & b
2936 pand mm2, mm0
2937 pandn mm7, mm4
2938 pandn mm0, mm1
2939 paddw mm7, mm5
2940 paddw mm0, mm2
2941 // test ((pa <= pb)? pa:pb) <= pc
2942 pcmpgtw mm7, mm6 // pab > pc?
2943 pxor mm1, mm1
2944 pand mm3, mm7
2945 pandn mm7, mm0
2946 pxor mm1, mm1
2947 paddw mm7, mm3
2948 pxor mm0, mm0
2949 // Step ebx to next set of 8 bytes and repeat loop until done
2950 add ebx, 8
2951 packuswb mm1, mm7
2952 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2953 cmp ebx, MMXLength
2954 movq [edi + ebx - 8], mm1 // write back updated value
2955 // mm1 will be used as Raw(x-bpp) next loop
2956 jb dpth4lp
2957 } // end _asm block
2958 }
2959 break;
2960 case 8: // bpp == 8
2961 {
2962 ActiveMask.use = 0x00000000ffffffff;
2963 _asm {
2964 mov ebx, diff
2965 mov edi, row
2966 mov esi, prev_row
2967 pxor mm0, mm0
2968 // PRIME the pump (load the first Raw(x-bpp) data set)
2969 movq mm1, [edi+ebx-8] // load a=Raw(x-bpp) bytes
2970 // (also re-read mid-loop for the 2nd set)
2971dpth8lp:
2972 // Do first set of 4 bytes
2973 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2974 punpcklbw mm1, mm0 // Unpack Low bytes of a
2975 movq mm2, [esi + ebx] // load b=Prior(x)
2976 punpcklbw mm2, mm0 // Unpack Low bytes of b
2977 // pav = p - a = (a + b - c) - a = b - c
2978 movq mm4, mm2
2979 punpcklbw mm3, mm0 // Unpack Low bytes of c
2980 // pbv = p - b = (a + b - c) - b = a - c
2981 movq mm5, mm1
2982 psubw mm4, mm3
2983 pxor mm7, mm7
2984 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2985 movq mm6, mm4
2986 psubw mm5, mm3
2987 // pa = abs(p-a) = abs(pav)
2988 // pb = abs(p-b) = abs(pbv)
2989 // pc = abs(p-c) = abs(pcv)
2990 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2991 paddw mm6, mm5
2992 pand mm0, mm4 // Only pav bytes < 0 in mm0
2993 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2994 psubw mm4, mm0
2995 pand mm7, mm5 // Only pbv bytes < 0 in mm7
2996 psubw mm4, mm0
2997 psubw mm5, mm7
2998 pxor mm0, mm0
2999 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3000 pand mm0, mm6 // Only pcv bytes < 0 in mm0
3001 psubw mm5, mm7
3002 psubw mm6, mm0
3003 // test pa <= pb
3004 movq mm7, mm4
3005 psubw mm6, mm0
3006 pcmpgtw mm7, mm5 // pa > pb?
3007 movq mm0, mm7
3008 // use mm7 mask to merge pa & pb
3009 pand mm5, mm7
3010 // use mm0 mask copy to merge a & b
3011 pand mm2, mm0
3012 pandn mm7, mm4
3013 pandn mm0, mm1
3014 paddw mm7, mm5
3015 paddw mm0, mm2
3016 // test ((pa <= pb)? pa:pb) <= pc
3017 pcmpgtw mm7, mm6 // pab > pc?
3018 pxor mm1, mm1
3019 pand mm3, mm7
3020 pandn mm7, mm0
3021 paddw mm7, mm3
3022 pxor mm0, mm0
3023 packuswb mm7, mm1
3024 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3025 pand mm7, ActiveMask
3026 movq mm2, [esi + ebx] // load b=Prior(x)
3027 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3028 punpckhbw mm3, mm0 // Unpack High bytes of c
3029 movq [edi + ebx], mm7 // write back updated value
3030 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3031
3032 // Do second set of 4 bytes
3033 punpckhbw mm2, mm0 // Unpack High bytes of b
3034 punpckhbw mm1, mm0 // Unpack High bytes of a
3035 // pav = p - a = (a + b - c) - a = b - c
3036 movq mm4, mm2
3037 // pbv = p - b = (a + b - c) - b = a - c
3038 movq mm5, mm1
3039 psubw mm4, mm3
3040 pxor mm7, mm7
3041 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3042 movq mm6, mm4
3043 psubw mm5, mm3
3044 // pa = abs(p-a) = abs(pav)
3045 // pb = abs(p-b) = abs(pbv)
3046 // pc = abs(p-c) = abs(pcv)
3047 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3048 paddw mm6, mm5
3049 pand mm0, mm4 // Only pav bytes < 0 in mm0
3050 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3051 psubw mm4, mm0
3052 pand mm7, mm5 // Only pbv bytes < 0 in mm7
3053 psubw mm4, mm0
3054 psubw mm5, mm7
3055 pxor mm0, mm0
3056 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3057 pand mm0, mm6 // Only pcv bytes < 0 in mm0
3058 psubw mm5, mm7
3059 psubw mm6, mm0
3060 // test pa <= pb
3061 movq mm7, mm4
3062 psubw mm6, mm0
3063 pcmpgtw mm7, mm5 // pa > pb?
3064 movq mm0, mm7
3065 // use mm7 mask to merge pa & pb
3066 pand mm5, mm7
3067 // use mm0 mask copy to merge a & b
3068 pand mm2, mm0
3069 pandn mm7, mm4
3070 pandn mm0, mm1
3071 paddw mm7, mm5
3072 paddw mm0, mm2
3073 // test ((pa <= pb)? pa:pb) <= pc
3074 pcmpgtw mm7, mm6 // pab > pc?
3075 pxor mm1, mm1
3076 pand mm3, mm7
3077 pandn mm7, mm0
3078 pxor mm1, mm1
3079 paddw mm7, mm3
3080 pxor mm0, mm0
3081 // Step ebx to next set of 8 bytes and repeat loop until done
3082 add ebx, 8
3083 packuswb mm1, mm7
3084 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3085 cmp ebx, MMXLength
3086 movq [edi + ebx - 8], mm1 // write back updated value
3087 // mm1 will be used as Raw(x-bpp) next loop
3088 jb dpth8lp
3089 } // end _asm block
3090 }
3091 break;
3092
3093 case 1: // bpp = 1
3094 case 2: // bpp = 2
3095 default: // bpp > 8
3096 {
3097 _asm {
3098 mov ebx, diff
3099 cmp ebx, FullLength
3100 jnb dpthdend
3101 mov edi, row
3102 mov esi, prev_row
3103 // Do Paeth decode for remaining bytes
3104 mov edx, ebx
3105 xor ecx, ecx // zero ecx before using cl & cx in loop below
3106 sub edx, bpp // Set edx = ebx - bpp
3107dpthdlp:
3108 xor eax, eax
3109 // pav = p - a = (a + b - c) - a = b - c
3110 mov al, [esi + ebx] // load Prior(x) into al
3111 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3112 sub eax, ecx // subtract Prior(x-bpp)
3113 mov patemp, eax // Save pav for later use
3114 xor eax, eax
3115 // pbv = p - b = (a + b - c) - b = a - c
3116 mov al, [edi + edx] // load Raw(x-bpp) into al
3117 sub eax, ecx // subtract Prior(x-bpp)
3118 mov ecx, eax
3119 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3120 add eax, patemp // pcv = pav + pbv
3121 // pc = abs(pcv)
3122 test eax, 0x80000000
3123 jz dpthdpca
3124 neg eax // reverse sign of neg values
3125dpthdpca:
3126 mov pctemp, eax // save pc for later use
3127 // pb = abs(pbv)
3128 test ecx, 0x80000000
3129 jz dpthdpba
3130 neg ecx // reverse sign of neg values
3131dpthdpba:
3132 mov pbtemp, ecx // save pb for later use
3133 // pa = abs(pav)
3134 mov eax, patemp
3135 test eax, 0x80000000
3136 jz dpthdpaa
3137 neg eax // reverse sign of neg values
3138dpthdpaa:
3139 mov patemp, eax // save pa for later use
3140 // test if pa <= pb
3141 cmp eax, ecx
3142 jna dpthdabb
3143 // pa > pb; now test if pb <= pc
3144 cmp ecx, pctemp
3145 jna dpthdbbc
3146 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3147 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3148 jmp dpthdpaeth
3149dpthdbbc:
3150 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3151 mov cl, [esi + ebx] // load Prior(x) into cl
3152 jmp dpthdpaeth
3153dpthdabb:
3154 // pa <= pb; now test if pa <= pc
3155 cmp eax, pctemp
3156 jna dpthdabc
3157 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3158 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3159 jmp dpthdpaeth
3160dpthdabc:
3161 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3162 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3163dpthdpaeth:
3164 inc ebx
3165 inc edx
3166 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3167 add [edi + ebx - 1], cl
3168 cmp ebx, FullLength
3169 jb dpthdlp
3170dpthdend:
3171 } // end _asm block
3172 }
3173 return; // No need to go further with this one
3174 } // end switch ( bpp )
3175 _asm
3176 {
3177 // MMX acceleration complete; now do clean-up
3178 // Check if any remaining bytes left to decode
3179 mov ebx, MMXLength
3180 cmp ebx, FullLength
3181 jnb dpthend
3182 mov edi, row
3183 mov esi, prev_row
3184 // Do Paeth decode for remaining bytes
3185 mov edx, ebx
3186 xor ecx, ecx // zero ecx before using cl & cx in loop below
3187 sub edx, bpp // Set edx = ebx - bpp
3188dpthlp2:
3189 xor eax, eax
3190 // pav = p - a = (a + b - c) - a = b - c
3191 mov al, [esi + ebx] // load Prior(x) into al
3192 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3193 sub eax, ecx // subtract Prior(x-bpp)
3194 mov patemp, eax // Save pav for later use
3195 xor eax, eax
3196 // pbv = p - b = (a + b - c) - b = a - c
3197 mov al, [edi + edx] // load Raw(x-bpp) into al
3198 sub eax, ecx // subtract Prior(x-bpp)
3199 mov ecx, eax
3200 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3201 add eax, patemp // pcv = pav + pbv
3202 // pc = abs(pcv)
3203 test eax, 0x80000000
3204 jz dpthpca2
3205 neg eax // reverse sign of neg values
3206dpthpca2:
3207 mov pctemp, eax // save pc for later use
3208 // pb = abs(pbv)
3209 test ecx, 0x80000000
3210 jz dpthpba2
3211 neg ecx // reverse sign of neg values
3212dpthpba2:
3213 mov pbtemp, ecx // save pb for later use
3214 // pa = abs(pav)
3215 mov eax, patemp
3216 test eax, 0x80000000
3217 jz dpthpaa2
3218 neg eax // reverse sign of neg values
3219dpthpaa2:
3220 mov patemp, eax // save pa for later use
3221 // test if pa <= pb
3222 cmp eax, ecx
3223 jna dpthabb2
3224 // pa > pb; now test if pb <= pc
3225 cmp ecx, pctemp
3226 jna dpthbbc2
3227 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3229 jmp dpthpaeth2
3230dpthbbc2:
3231 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3232 mov cl, [esi + ebx] // load Prior(x) into cl
3233 jmp dpthpaeth2
3234dpthabb2:
3235 // pa <= pb; now test if pa <= pc
3236 cmp eax, pctemp
3237 jna dpthabc2
3238 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3239 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3240 jmp dpthpaeth2
3241dpthabc2:
3242 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3243 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3244dpthpaeth2:
3245 inc ebx
3246 inc edx
3247 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3248 add [edi + ebx - 1], cl
3249 cmp ebx, FullLength
3250 jb dpthlp2
3251dpthend:
3252 emms // End MMX instructions; prep for possible FP instrs.
3253 } // end _asm block
3254}
3255
3256// Optimized code for PNG Sub filter decoder
3257void /* PRIVATE */
3258png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3259{
3260 //int test;
3261 int bpp;
3262 png_uint_32 FullLength;
3263 png_uint_32 MMXLength;
3264 int diff;
3265
3266 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3267 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3268 _asm {
3269 mov edi, row
3270 mov esi, edi // lp = row
3271 add edi, bpp // rp = row + bpp
3272 xor eax, eax
3273 // get # of bytes to alignment
3274 mov diff, edi // take start of row
3275 add diff, 0xf // add 7 + 8 to incr past
3276 // alignment boundary
3277 xor ebx, ebx
3278 and diff, 0xfffffff8 // mask to alignment boundary
3279 sub diff, edi // subtract from start ==> value
3280 // ebx at alignment
3281 jz dsubgo
3282 // fix alignment
3283dsublp1:
3284 mov al, [esi+ebx]
3285 add [edi+ebx], al
3286 inc ebx
3287 cmp ebx, diff
3288 jb dsublp1
3289dsubgo:
3290 mov ecx, FullLength
3291 mov edx, ecx
3292 sub edx, ebx // subtract alignment fix
3293 and edx, 0x00000007 // calc bytes over mult of 8
3294 sub ecx, edx // drop over bytes from length
3295 mov MMXLength, ecx
3296 } // end _asm block
3297
3298 // Now do the math for the rest of the row
3299 switch ( bpp )
3300 {
3301 case 3:
3302 {
3303 ActiveMask.use = 0x0000ffffff000000;
3304 ShiftBpp.use = 24; // == 3 * 8
3305 ShiftRem.use = 40; // == 64 - 24
3306 _asm {
3307 mov edi, row
3308 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3309 mov esi, edi // lp = row
3310 add edi, bpp // rp = row + bpp
3311 movq mm6, mm7
3312 mov ebx, diff
3313 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3314 // byte group
3315 // PRIME the pump (load the first Raw(x-bpp) data set)
3316 movq mm1, [edi+ebx-8]
3317dsub3lp:
3318 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3319 // no need for mask; shift clears inactive bytes
3320 // Add 1st active group
3321 movq mm0, [edi+ebx]
3322 paddb mm0, mm1
3323 // Add 2nd active group
3324 movq mm1, mm0 // mov updated Raws to mm1
3325 psllq mm1, ShiftBpp // shift data to position correctly
3326 pand mm1, mm7 // mask to use only 2nd active group
3327 paddb mm0, mm1
3328 // Add 3rd active group
3329 movq mm1, mm0 // mov updated Raws to mm1
3330 psllq mm1, ShiftBpp // shift data to position correctly
3331 pand mm1, mm6 // mask to use only 3rd active group
3332 add ebx, 8
3333 paddb mm0, mm1
3334 cmp ebx, MMXLength
3335 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3336 // Prep for doing 1st add at top of loop
3337 movq mm1, mm0
3338 jb dsub3lp
3339 } // end _asm block
3340 }
3341 break;
3342
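// For the Sub filter every byte depends on the byte bpp positions to
// its left, so one paddb cannot finish a quadword. The bpp == 3 case
// above instead propagates the recurrence through each 8-byte block in
// three masked steps; roughly (a sketch, q is the quadword at row[x],
// all additions bytewise mod 256):
//
//    q += (previous 3 raw bytes);          // 1st active group
//    q += (q << 24) & ActiveMask;          // 2nd active group
//    q += (q << 24) & (ActiveMask << 24);  // 3rd active group
//
// where each shift re-presents freshly decoded bytes as Raw(x-bpp)
// for the next group of pixels.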
3343 case 1:
3344 {
3345 // For reference, the equivalent non-MMX code for the SUB filter
3346 // (see png_read_filter_row below):
3347 //
3348 // png_bytep rp;
3349 // png_bytep lp;
3350 // png_uint_32 i;
3351 // bpp = (row_info->pixel_depth + 7) >> 3;
3352 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3353 // i < row_info->rowbytes; i++, rp++, lp++)
3354 // {
3355 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3356 // }
3357 _asm {
3358 mov ebx, diff
3359 mov edi, row
3360 cmp ebx, FullLength
3361 jnb dsub1end
3362 mov esi, edi // lp = row
3363 xor eax, eax
3364 add edi, bpp // rp = row + bpp
3365dsub1lp:
3366 mov al, [esi+ebx]
3367 add [edi+ebx], al
3368 inc ebx
3369 cmp ebx, FullLength
3370 jb dsub1lp
3371dsub1end:
3372 } // end _asm block
3373 }
3374 return;
3375
3376 case 6:
3377 case 7:
3378 case 4:
3379 case 5:
3380 {
3381 ShiftBpp.use = bpp << 3;
3382 ShiftRem.use = 64 - ShiftBpp.use;
3383 _asm {
3384 mov edi, row
3385 mov ebx, diff
3386 mov esi, edi // lp = row
3387 add edi, bpp // rp = row + bpp
3388 // PRIME the pump (load the first Raw(x-bpp) data set)
3389 movq mm1, [edi+ebx-8]
3390dsub4lp:
3391 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3392 // no need for mask; shift clears inactive bytes
3393 movq mm0, [edi+ebx]
3394 paddb mm0, mm1
3395 // Add 2nd active group
3396 movq mm1, mm0 // mov updated Raws to mm1
3397 psllq mm1, ShiftBpp // shift data to position correctly
3398 // there is no need for any mask
3399 // since shift clears inactive bits/bytes
3400 add ebx, 8
3401 paddb mm0, mm1
3402 cmp ebx, MMXLength
3403 movq [edi+ebx-8], mm0
3404 movq mm1, mm0 // Prep for doing 1st add at top of loop
3405 jb dsub4lp
3406 } // end _asm block
3407 }
3408 break;
3409
3410 case 2:
3411 {
3412 ActiveMask.use = 0x00000000ffff0000;
3413 ShiftBpp.use = 16; // == 2 * 8
3414 ShiftRem.use = 48; // == 64 - 16
3415 _asm {
3416 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3417 mov ebx, diff
3418 movq mm6, mm7
3419 mov edi, row
3420 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3421 // byte group
3422 mov esi, edi // lp = row
3423 movq mm5, mm6
3424 add edi, bpp // rp = row + bpp
3425 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3426 // byte group
3427 // PRIME the pump (load the first Raw(x-bpp) data set)
3428 movq mm1, [edi+ebx-8]
3429dsub2lp:
3430 // Add 1st active group
3431 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3432 // no need for mask; shift clears inactive
3433 // bytes
3434 movq mm0, [edi+ebx]
3435 paddb mm0, mm1
3436 // Add 2nd active group
3437 movq mm1, mm0 // mov updated Raws to mm1
3438 psllq mm1, ShiftBpp // shift data to position correctly
3439 pand mm1, mm7 // mask to use only 2nd active group
3440 paddb mm0, mm1
3441 // Add 3rd active group
3442 movq mm1, mm0 // mov updated Raws to mm1
3443 psllq mm1, ShiftBpp // shift data to position correctly
3444 pand mm1, mm6 // mask to use only 3rd active group
3445 paddb mm0, mm1
3446 // Add 4th active group
3447 movq mm1, mm0 // mov updated Raws to mm1
3448 psllq mm1, ShiftBpp // shift data to position correctly
3449 pand mm1, mm5 // mask to use only 4th active group
3450 add ebx, 8
3451 paddb mm0, mm1
3452 cmp ebx, MMXLength
3453 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3454 movq mm1, mm0 // Prep for doing 1st add at top of loop
3455 jb dsub2lp
3456 } // end _asm block
3457 }
3458 break;
3459 case 8:
3460 {
3461 _asm {
3462 mov edi, row
3463 mov ebx, diff
3464 mov esi, edi // lp = row
3465 add edi, bpp // rp = row + bpp
3466 mov ecx, MMXLength
3467 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3468 // Raw(x-bpp) data set)
 mov edx, ecx
 sub edx, ebx // subtract alignment fix
 and edx, 0x0000003f // calc bytes over mult of 64
 sub ecx, edx // drop over bytes; ecx ==> end of 64-byte loop
3470dsub8lp:
3471 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3472 paddb mm0, mm7
3473 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3474 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3475 // Now mm0 will be used as Raw(x-bpp) for
3476 // the 2nd group of 8 bytes. This will be
3477 // repeated for each group of 8 bytes with
3478 // the 8th group being used as the Raw(x-bpp)
3479 // for the 1st group of the next loop.
3480 paddb mm1, mm0
3481 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3482 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3483 paddb mm2, mm1
3484 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3485 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3486 paddb mm3, mm2
3487 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3488 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3489 paddb mm4, mm3
3490 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3491 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3492 paddb mm5, mm4
3493 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3494 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3495 paddb mm6, mm5
3496 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3497 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3498 add ebx, 64
3499 paddb mm7, mm6
3500 cmp ebx, ecx
3501 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3502 jb dsub8lp
3503 cmp ebx, MMXLength
3504 jnb dsub8lt8
3505dsub8lpA:
3506 movq mm0, [edi+ebx]
3507 add ebx, 8
3508 paddb mm0, mm7
3509 cmp ebx, MMXLength
3510 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3511 movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
3512 // be the new Raw(x-bpp) for the next loop
3513 jb dsub8lpA
3514dsub8lt8:
3515 } // end _asm block
3516 }
3517 break;
3518
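// The 64-byte unrolled loop in the bpp == 8 case above keeps the
// recurrence chain in registers: each 8-byte block is both stored and
// fed to the next block's paddb, and mm7 carries the 8th block into
// the next iteration. Because bpp equals the register width, the
// previous quadword is exactly Raw(x-bpp); bytewise (mod 256) one
// iteration amounts to:
//
//    for (k = 0; k < 64; k += 8)
//       add_8_bytes(row + x + k, row + x + k - 8);  // one paddb
//
// add_8_bytes() is a hypothetical stand-in for the movq/paddb/movq
// sequence, named here only for the sketch.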
3519 default: // bpp greater than 8 bytes
3520 {
3521 _asm {
3522 mov ebx, diff
3523 mov edi, row
3524 mov esi, edi // lp = row
3525 add edi, bpp // rp = row + bpp
3526dsubAlp:
3527 movq mm0, [edi+ebx]
3528 movq mm1, [esi+ebx]
3529 add ebx, 8
3530 paddb mm0, mm1
3531 cmp ebx, MMXLength
3532 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset
3533 // add ebx
3534 jb dsubAlp
3535 } // end _asm block
3536 }
3537 break;
3538
3539 } // end switch ( bpp )
3540
3541 _asm {
3542 mov ebx, MMXLength
3543 mov edi, row
3544 cmp ebx, FullLength
3545 jnb dsubend
3546 mov esi, edi // lp = row
3547 xor eax, eax
3548 add edi, bpp // rp = row + bpp
3549dsublp2:
3550 mov al, [esi+ebx]
3551 add [edi+ebx], al
3552 inc ebx
3553 cmp ebx, FullLength
3554 jb dsublp2
3555dsubend:
3556 emms // End MMX instructions; prep for possible FP instrs.
3557 } // end _asm block
3558}
3559
3560// Optimized code for PNG Up filter decoder
3561void /* PRIVATE */
3562png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3563 png_bytep prev_row)
3564{
3565 png_uint_32 len;
3566 len = row_info->rowbytes; // # of bytes to filter
3567 _asm {
3568 mov edi, row
3569 // get # of bytes to alignment
3570 mov ecx, edi
3571 xor ebx, ebx
3572 add ecx, 0x7
3573 xor eax, eax
3574 and ecx, 0xfffffff8
3575 mov esi, prev_row
3576 sub ecx, edi
3577 jz dupgo
3578 // fix alignment
3579duplp1:
3580 mov al, [edi+ebx]
3581 add al, [esi+ebx]
3582 inc ebx
3583 cmp ebx, ecx
3584 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3585 jb duplp1
3586dupgo:
3587 mov ecx, len
3588 mov edx, ecx
3589 sub edx, ebx // subtract alignment fix
3590 and edx, 0x0000003f // calc bytes over mult of 64
3591 sub ecx, edx // drop over bytes from length
3592 // Unrolled loop - use all MMX registers and interleave to reduce
3593 // number of branch instructions (loops) and reduce partial stalls
3594duploop:
3595 movq mm1, [esi+ebx]
3596 movq mm0, [edi+ebx]
3597 movq mm3, [esi+ebx+8]
3598 paddb mm0, mm1
3599 movq mm2, [edi+ebx+8]
3600 movq [edi+ebx], mm0
3601 paddb mm2, mm3
3602 movq mm5, [esi+ebx+16]
3603 movq [edi+ebx+8], mm2
3604 movq mm4, [edi+ebx+16]
3605 movq mm7, [esi+ebx+24]
3606 paddb mm4, mm5
3607 movq mm6, [edi+ebx+24]
3608 movq [edi+ebx+16], mm4
3609 paddb mm6, mm7
3610 movq mm1, [esi+ebx+32]
3611 movq [edi+ebx+24], mm6
3612 movq mm0, [edi+ebx+32]
3613 movq mm3, [esi+ebx+40]
3614 paddb mm0, mm1
3615 movq mm2, [edi+ebx+40]
3616 movq [edi+ebx+32], mm0
3617 paddb mm2, mm3
3618 movq mm5, [esi+ebx+48]
3619 movq [edi+ebx+40], mm2
3620 movq mm4, [edi+ebx+48]
3621 movq mm7, [esi+ebx+56]
3622 paddb mm4, mm5
3623 movq mm6, [edi+ebx+56]
3624 movq [edi+ebx+48], mm4
3625 add ebx, 64
3626 paddb mm6, mm7
3627 cmp ebx, ecx
3628 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3629 // -8 to offset add ebx
3630 jb duploop
3631
3632 cmp edx, 0 // Test for bytes over mult of 64
3633 jz dupend
3634
3635
3636 // 2 lines added by lcreeve at netins.net
3637 // (mail 11 Jul 98 in png-implement list)
3638 cmp edx, 8 // test for less than 8 bytes
3639 jb duplt8
3640
3641
3642 add ecx, edx
3643 and edx, 0x00000007 // calc bytes over mult of 8
3644 sub ecx, edx // drop over bytes from length
3645 jz duplt8
3646 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3647duplpA:
3648 movq mm1, [esi+ebx]
3649 movq mm0, [edi+ebx]
3650 add ebx, 8
3651 paddb mm0, mm1
3652 cmp ebx, ecx
3653 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3654 jb duplpA
3655 cmp edx, 0 // Test for bytes over mult of 8
3656 jz dupend
3657duplt8:
3658 xor eax, eax
3659 add ecx, edx // move over byte count into counter
3660 // Loop using x86 registers to update remaining bytes
3661duplp2:
3662 mov al, [edi + ebx]
3663 add al, [esi + ebx]
3664 inc ebx
3665 cmp ebx, ecx
3666 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3667 jb duplp2
3668dupend:
3669 // Conversion of filtered row completed
3670 emms // End MMX instructions; prep for possible FP instrs.
3671 } // end _asm block
3672}
3673
3674
3675// Optimized png_read_filter_row routines
3676void /* PRIVATE */
3677png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3678 row, png_bytep prev_row, int filter)
3679{
3680#ifdef PNG_DEBUG
3681 char filnm[10];
3682#endif
3683
3684 if (mmx_supported == 2) {
3685#if !defined(PNG_1_0_X)
3686 /* this should have happened in png_init_mmx_flags() already */
3687 png_warning(png_ptr, "asm_flags may not have been initialized");
3688#endif
3689 png_mmx_support();
3690 }
3691
3692#ifdef PNG_DEBUG
3693 png_debug(1, "in png_read_filter_row\n");
3694 switch (filter)
3695 {
3696 case 0: sprintf(filnm, "none");
3697 break;
3698#if !defined(PNG_1_0_X)
3699 case 1: sprintf(filnm, "sub-%s",
3700 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3701 break;
3702 case 2: sprintf(filnm, "up-%s",
3703 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3704 break;
3705 case 3: sprintf(filnm, "avg-%s",
3706 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3707 break;
3708 case 4: sprintf(filnm, "Paeth-%s",
3709 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3710 break;
3711#else
3712 case 1: sprintf(filnm, "sub");
3713 break;
3714 case 2: sprintf(filnm, "up");
3715 break;
3716 case 3: sprintf(filnm, "avg");
3717 break;
3718 case 4: sprintf(filnm, "Paeth");
3719 break;
3720#endif
3721 default: sprintf(filnm, "unknw");
3722 break;
3723 }
3724 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3725 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3726 (int)((row_info->pixel_depth + 7) >> 3));
3727 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3728#endif /* PNG_DEBUG */
3729
3730 switch (filter)
3731 {
3732 case PNG_FILTER_VALUE_NONE:
3733 break;
3734
3735 case PNG_FILTER_VALUE_SUB:
3736 {
3737#if !defined(PNG_1_0_X)
3738 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3739 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3740 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3741#else
3742 if (mmx_supported)
3743#endif
3744 {
3745 png_read_filter_row_mmx_sub(row_info, row);
3746 }
3747 else
3748 {
3749 png_uint_32 i;
3750 png_uint_32 istop = row_info->rowbytes;
3751 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3752 png_bytep rp = row + bpp;
3753 png_bytep lp = row;
3754
3755 for (i = bpp; i < istop; i++)
3756 {
3757 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3758 rp++;
3759 }
3760 }
3761 break;
3762 }
3763
3764 case PNG_FILTER_VALUE_UP:
3765 {
3766#if !defined(PNG_1_0_X)
3767 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3768 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3769 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3770#else
3771 if (mmx_supported)
3772#endif
3773 {
3774 png_read_filter_row_mmx_up(row_info, row, prev_row);
3775 }
3776 else
3777 {
3778 png_uint_32 i;
3779 png_uint_32 istop = row_info->rowbytes;
3780 png_bytep rp = row;
3781 png_bytep pp = prev_row;
3782
3783 for (i = 0; i < istop; ++i)
3784 {
3785 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3786 rp++;
3787 }
3788 }
3789 break;
3790 }
3791
3792 case PNG_FILTER_VALUE_AVG:
3793 {
3794#if !defined(PNG_1_0_X)
3795 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3796 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3797 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3798#else
3799 if (mmx_supported)
3800#endif
3801 {
3802 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3803 }
3804 else
3805 {
3806 png_uint_32 i;
3807 png_bytep rp = row;
3808 png_bytep pp = prev_row;
3809 png_bytep lp = row;
3810 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3811 png_uint_32 istop = row_info->rowbytes - bpp;
3812
3813 for (i = 0; i < bpp; i++)
3814 {
3815 *rp = (png_byte)(((int)(*rp) +
3816 ((int)(*pp++) >> 1)) & 0xff);
3817 rp++;
3818 }
3819
3820 for (i = 0; i < istop; i++)
3821 {
3822 *rp = (png_byte)(((int)(*rp) +
3823 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3824 rp++;
3825 }
3826 }
3827 break;
3828 }
3829
3830 case PNG_FILTER_VALUE_PAETH:
3831 {
3832#if !defined(PNG_1_0_X)
3833 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3834 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3835 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3836#else
3837 if (mmx_supported)
3838#endif
3839 {
3840 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3841 }
3842 else
3843 {
3844 png_uint_32 i;
3845 png_bytep rp = row;
3846 png_bytep pp = prev_row;
3847 png_bytep lp = row;
3848 png_bytep cp = prev_row;
3849 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3850 png_uint_32 istop=row_info->rowbytes - bpp;
3851
3852 for (i = 0; i < bpp; i++)
3853 {
3854 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3855 rp++;
3856 }
3857
3858 for (i = 0; i < istop; i++) // use leftover rp,pp
3859 {
3860 int a, b, c, pa, pb, pc, p;
3861
3862 a = *lp++;
3863 b = *pp++;
3864 c = *cp++;
3865
3866 p = b - c;
3867 pc = a - c;
3868
3869#ifdef PNG_USE_ABS
3870 pa = abs(p);
3871 pb = abs(pc);
3872 pc = abs(p + pc);
3873#else
3874 pa = p < 0 ? -p : p;
3875 pb = pc < 0 ? -pc : pc;
3876 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3877#endif
3878
3879 /*
3880 if (pa <= pb && pa <= pc)
3881 p = a;
3882 else if (pb <= pc)
3883 p = b;
3884 else
3885 p = c;
3886 */
3887
3888 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
3889
3890 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3891 rp++;
3892 }
3893 }
3894 break;
3895 }
3896
3897 default:
3898 png_warning(png_ptr, "Ignoring bad row filter type");
3899 *row=0;
3900 break;
3901 }
3902}
3903
3904#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */