source: liacs/MIR2010/SourceCode/cximage/png/pnggccrd.c@ 209

Last change on this file since 209 was 95, checked in by Rick van der Zwet, 15 years ago

Bad boy, improper move of directory

File size: 235.1 KB
Line 
1
2/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 *
4 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5 *
6 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
7 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
8 * for Intel's performance analysis of the MMX vs. non-MMX code.
9 *
10 * Last changed in libpng 1.2.15 January 5, 2007
11 * For conditions of distribution and use, see copyright notice in png.h
12 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
13 * Copyright (c) 1998, Intel Corporation
14 *
15 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
16 * Interface to libpng contributed by Gilles Vollant, 1999.
17 * GNU C port by Greg Roelofs, 1999-2001.
18 *
19 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
20 *
21 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22 *
23 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24 *
25 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
26 * is required to assemble the newer MMX instructions such as movq.
27 * For djgpp, see
28 *
29 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30 *
31 * (or a later version in the same directory). For Linux, check your
32 * distribution's web site(s) or try these links:
33 *
34 * http://rufus.w3.org/linux/RPM/binutils.html
35 * http://www.debian.org/Packages/stable/devel/binutils.html
36 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
37 * binutils.tgz
38 *
39 * For other platforms, see the main GNU site:
40 *
41 * ftp://ftp.gnu.org/pub/gnu/binutils/
42 *
43 * Version 2.5.2l.15 is definitely too old...
44 */
45
46/*
47 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
48 * =====================================
49 *
50 * 19991006:
51 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
52 *
53 * 19991007:
54 * - additional optimizations (possible or definite):
55 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
56 * - write MMX code for 48-bit case (pixel_bytes == 6)
57 * - figure out what's up with 24-bit case (pixel_bytes == 3):
58 * why subtract 8 from width_mmx in the pass 4/5 case?
59 * (only width_mmx case) (near line 1606)
60 * x [DONE] replace pixel_bytes within each block with the true
61 * constant value (or are compilers smart enough to do that?)
62 * - rewrite all MMX interlacing code so it's aligned with
63 * the *beginning* of the row buffer, not the end. This
64 * would not only allow one to eliminate half of the memory
65 * writes for odd passes (that is, pass == odd), it may also
66 * eliminate some unaligned-data-access exceptions (assuming
67 * there's a penalty for not aligning 64-bit accesses on
68 * 64-bit boundaries). The only catch is that the "leftover"
69 * pixel(s) at the end of the row would have to be saved,
70 * but there are enough unused MMX registers in every case,
71 * so this is not a problem. A further benefit is that the
72 * post-MMX cleanup code (C code) in at least some of the
73 * cases could be done within the assembler block.
74 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
75 * inconsistent, and don't match the MMX Programmer's Reference
76 * Manual conventions anyway. They should be changed to
77 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
78 * was lowest in memory (e.g., corresponding to a left pixel)
79 * and b7 is the byte that was highest (e.g., a right pixel).
80 *
81 * 19991016:
82 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
83 * want globals prefixed by underscores when referencing them--
84 * i.e., if the variable is const4, then refer to it as const4,
85 * not _const4. This seems to be a djgpp-specific requirement.
86 * Also, such variables apparently *must* be declared outside
87 * of functions; neither static nor automatic variables work if
88 * defined within the scope of a single function, but both
89 * static and truly global (multi-module) variables work fine.
90 *
91 * 19991023:
92 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
93 * - switched from string-concatenation-with-macros to cleaner method of
94 * renaming global variables for djgpp--i.e., always use prefixes in
95 * inlined assembler code (== strings) and conditionally rename the
96 * variables, not the other way around. Hence _const4, _mask8_0, etc.
97 *
98 * 19991024:
99 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
100 * This one was severely weird: even though mmxsupport() doesn't touch
101 * ebx (where "row" pointer was stored), it nevertheless managed to zero
102 * the register (even in static/non-fPIC code--see below), which in turn
103 * caused png_do_read_interlace() to return prematurely on the first row of
104 * interlaced images (i.e., without expanding the interlaced pixels).
105 * Inspection of the generated assembly code didn't turn up any clues,
106 * although it did point at a minor optimization (i.e., get rid of
107 * mmx_supported_local variable and just use eax). Possibly the CPUID
108 * instruction is more destructive than it looks? (Not yet checked.)
109 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
110 * listings... Apparently register spillage has to do with ebx, since
111 * it's used to index the global offset table. Commenting it out of the
112 * input-reg lists in png_combine_row() eliminated compiler barfage, so
113 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
114 *
115 * 19991107:
116 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
117 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
118 *
119 * 19991120:
120 * - made "diff" variable (now "_dif") global to simplify conversion of
121 * filtering routines (running out of regs, sigh). "diff" is still used
122 * in interlacing routines, however.
123 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
124 * macro determines which is used); original not yet tested.
125 *
126 * 20000213:
127 * - when compiling with gcc, be sure to use -fomit-frame-pointer
128 *
129 * 20000319:
130 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
131 * pass == 4 or 5, that caused visible corruption of interlaced images
132 *
133 * 20000623:
134 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
135 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
136 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
137 * Chuck Wilson supplied a patch involving dummy output registers. See
138 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
139 * for the original (anonymous) SourceForge bug report.
140 *
141 * 20000706:
142 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
143 * pnggccrd.c: In function `png_combine_row':
144 * pnggccrd.c:525: more than 10 operands in `asm'
145 * pnggccrd.c:669: more than 10 operands in `asm'
146 * pnggccrd.c:828: more than 10 operands in `asm'
147 * pnggccrd.c:994: more than 10 operands in `asm'
148 * pnggccrd.c:1177: more than 10 operands in `asm'
149 * They are all the same problem and can be worked around by using the
150 * global _unmask variable unconditionally, not just in the -fPIC case.
151 * Reportedly earlier versions of gcc also have the problem with more than
152 * 10 operands; they just don't report it. Much strangeness ensues, etc.
153 *
154 * 20000729:
155 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
156 * MMX routine); began converting png_read_filter_row_mmx_sub()
157 * - to finish remaining sections:
158 * - clean up indentation and comments
159 * - preload local variables
160 * - add output and input regs (order of former determines numerical
161 * mapping of latter)
162 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
163 * - remove "$" from addressing of Shift and Mask variables [20000823]
164 *
165 * 20000731:
166 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
167 *
168 * 20000822:
169 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
170 * shared-library (-fPIC) version! Code works just fine as part of static
171 * library. Damn damn damn damn damn, should have tested that sooner.
172 * ebx is getting clobbered again (explicitly this time); need to save it
173 * on stack or rewrite asm code to avoid using it altogether. Blargh!
174 *
175 * 20000823:
176 * - first section was trickiest; all remaining sections have ebx -> edx now.
177 * (-fPIC works again.) Also added missing underscores to various Shift*
178 * and *Mask* globals and got rid of leading "$" signs.
179 *
180 * 20000826:
181 * - added visual separators to help navigate microscopic printed copies
182 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
183 * on png_read_filter_row_mmx_avg()
184 *
185 * 20000828:
186 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
187 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
188 * cleaned up/shortened in either routine, but functionality is complete
189 * and seems to be working fine.
190 *
191 * 20000829:
192 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
193 * as an input reg (with dummy output variables, etc.), then it *cannot*
194 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
195 * is simple enough...
196 *
197 * 20000914:
198 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
199 * correctly (but 48-bit RGB just fine)
200 *
201 * 20000916:
202 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
203 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
204 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
205 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
206 *
207 * 20010101:
208 * - added new png_init_mmx_flags() function (here only because it needs to
209 * call mmxsupport(), which should probably become global png_mmxsupport());
210 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
211 *
212 * 20010103:
213 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
214 * and made it public; moved png_init_mmx_flags() to png.c as internal func
215 *
216 * 20010104:
217 * - removed dependency on png_read_filter_row_c() (C code already duplicated
218 * within MMX version of png_read_filter_row()) so no longer necessary to
219 * compile it into pngrutil.o
220 *
221 * 20010310:
222 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
223 *
224 * 20020304:
225 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
226 *
227 * 20040724:
228 * - more tinkering with clobber list at lines 4529 and 5033, to get
229 * it to compile on gcc-3.4.
230 *
231 * STILL TO DO:
232 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
233 * - write MMX code for 48-bit case (pixel_bytes == 6)
234 * - figure out what's up with 24-bit case (pixel_bytes == 3):
235 * why subtract 8 from width_mmx in the pass 4/5 case?
236 * (only width_mmx case) (near line 1606)
237 * - rewrite all MMX interlacing code so it's aligned with beginning
238 * of the row buffer, not the end (see 19991007 for details)
239 * x pick one version of mmxsupport() and get rid of the other
240 * - add error messages to any remaining bogus default cases
241 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
242 * x add support for runtime enable/disable/query of various MMX routines
243 */
244
245#define PNG_INTERNAL
246#include "png.h"
247
248#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
249
250int PNGAPI png_mmx_support(void);
251
#ifdef PNG_USE_LOCAL_ARRAYS
/* Per-pass interlace tables, indexed by png_ptr->pass (0..6).  These are
 * file-local copies of the arrays of the same names in png.c.  The C
 * fallback paths of png_combine_row() use them, in pixel units, as:
 *   png_pass_start - offset of the first pixel handled in the row
 *   png_pass_inc   - distance from one handled pixel to the next
 *   png_pass_width - number of pixels copied at each step
 * The storage class is written first: putting "static" after a qualifier
 * is an obsolescent form that gcc flags with -Wold-style-declaration. */
static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
#endif
257
258#if defined(PNG_MMX_CODE_SUPPORTED)
/* The inline assembler in this file always spells global symbols with a
 * leading underscore.  The djgpp, Win32, Cygwin and OS/2 toolchains add
 * that underscore to C globals by themselves, so on those targets the C
 * names are mapped to their bare forms here; everywhere else the variables
 * keep the underscore in their C names. */
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
    defined(__OS2__)

   /* MMX-availability flag and constants */
#  define _mmx_supported  mmx_supported
#  define _const4         const4
#  define _const6         const6

   /* per-pixel-size masks */
#  define _mask8_0        mask8_0
#  define _mask16_1       mask16_1
#  define _mask16_0       mask16_0
#  define _mask24_2       mask24_2
#  define _mask24_1       mask24_1
#  define _mask24_0       mask24_0
#  define _mask32_3       mask32_3
#  define _mask32_2       mask32_2
#  define _mask32_1       mask32_1
#  define _mask32_0       mask32_0
#  define _mask48_5       mask48_5
#  define _mask48_4       mask48_4
#  define _mask48_3       mask48_3
#  define _mask48_2       mask48_2
#  define _mask48_1       mask48_1
#  define _mask48_0       mask48_0

   /* carry/clear/active masks and shift counts */
#  define _LBCarryMask    LBCarryMask
#  define _HBClearMask    HBClearMask
#  define _ActiveMask     ActiveMask
#  define _ActiveMask2    ActiveMask2
#  define _ActiveMaskEnd  ActiveMaskEnd
#  define _ShiftBpp       ShiftBpp
#  define _ShiftRem       ShiftRem

   /* writable globals, present only in thread-unsafe builds */
#  ifdef PNG_THREAD_UNSAFE_OK
#    define _unmask       unmask
#    define _FullLength   FullLength
#    define _MMXLength    MMXLength
#    define _dif          dif
#    define _patemp       patemp
#    define _pbtemp       pbtemp
#    define _pctemp       pctemp
#  endif

#endif
299
300
301/* These constants are used in the inlined MMX assembly code.
302 Ignore gcc's "At top level: defined but not used" warnings. */
303
304/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
305 * since that case uses the %ebx register for indexing the Global Offset Table
306 * and there were no other registers available. But gcc 2.95 and later emit
307 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
308 * in the non-PIC case, so we'll just use the global unconditionally now.
309 */
310#ifdef PNG_THREAD_UNSAFE_OK
311static int _unmask;
312#endif
313
/* Quadword masks loaded by the inline MMX code (the asm strings refer to
 * them by symbol name, so the names must not change).
 *
 * In the 8-, 16- and 24-bit png_combine_row() cases each byte of a mask
 * holds the single bit (0x80 for the leftmost pixel down to 0x01 for the
 * rightmost) of the 8-pixel mask that governs that byte of the row, and
 * _maskNN_k covers the k-th group of eight bytes of an 8-pixel run for
 * NN-bit pixels.  The 32- and 48-bit masks follow the same byte pattern;
 * presumably they are used the same way further down the file.
 *
 * "static" comes first (the other order is obsolescent in C), and the
 * literals carry a ULL suffix to match the declared unsigned type.  The
 * values themselves are unchanged. */
static const unsigned long long _mask8_0  = 0x0102040810204080ULL;

static const unsigned long long _mask16_1 = 0x0101020204040808ULL;
static const unsigned long long _mask16_0 = 0x1010202040408080ULL;

static const unsigned long long _mask24_2 = 0x0101010202020404ULL;
static const unsigned long long _mask24_1 = 0x0408080810101020ULL;
static const unsigned long long _mask24_0 = 0x2020404040808080ULL;

static const unsigned long long _mask32_3 = 0x0101010102020202ULL;
static const unsigned long long _mask32_2 = 0x0404040408080808ULL;
static const unsigned long long _mask32_1 = 0x1010101020202020ULL;
static const unsigned long long _mask32_0 = 0x4040404080808080ULL;

static const unsigned long long _mask48_5 = 0x0101010101010202ULL;
static const unsigned long long _mask48_4 = 0x0202020204040404ULL;
static const unsigned long long _mask48_3 = 0x0404080808080808ULL;
static const unsigned long long _mask48_2 = 0x1010101010102020ULL;
static const unsigned long long _mask48_1 = 0x2020202040404040ULL;
static const unsigned long long _mask48_0 = 0x4040808080808080ULL;

/* Low-3-byte and low-1-byte selectors.  A companion _const5
 * (0x000000FFFFFF0000) is not used and is therefore not defined. */
static const unsigned long long _const4   = 0x0000000000FFFFFFULL;
static const unsigned long long _const6   = 0x00000000000000FFULL;
338
339// These are used in the row-filter routines and should/would be local
340// variables if not for gcc addressing limitations.
341// WARNING: Their presence probably defeats the thread safety of libpng.
342
343#ifdef PNG_THREAD_UNSAFE_OK
344static png_uint_32 _FullLength;
345static png_uint_32 _MMXLength;
346static int _dif;
347static int _patemp; // temp variables for Paeth routine
348static int _pbtemp;
349static int _pctemp;
350#endif
351
352void /* PRIVATE */
353png_squelch_warnings(void)
354{
355#ifdef PNG_THREAD_UNSAFE_OK
356 _dif = _dif;
357 _patemp = _patemp;
358 _pbtemp = _pbtemp;
359 _pctemp = _pctemp;
360 _MMXLength = _MMXLength;
361#endif
362 _const4 = _const4;
363 _const6 = _const6;
364 _mask8_0 = _mask8_0;
365 _mask16_1 = _mask16_1;
366 _mask16_0 = _mask16_0;
367 _mask24_2 = _mask24_2;
368 _mask24_1 = _mask24_1;
369 _mask24_0 = _mask24_0;
370 _mask32_3 = _mask32_3;
371 _mask32_2 = _mask32_2;
372 _mask32_1 = _mask32_1;
373 _mask32_0 = _mask32_0;
374 _mask48_5 = _mask48_5;
375 _mask48_4 = _mask48_4;
376 _mask48_3 = _mask48_3;
377 _mask48_2 = _mask48_2;
378 _mask48_1 = _mask48_1;
379 _mask48_0 = _mask48_0;
380}
381#endif /* PNG_MMX_CODE_SUPPORTED */
382
383
384static int _mmx_supported = 2;
385
386/*===========================================================================*/
387/* */
388/* P N G _ C O M B I N E _ R O W */
389/* */
390/*===========================================================================*/
391
392#if defined(PNG_HAVE_MMX_COMBINE_ROW)
393
394#define BPP2 2
395#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
396#define BPP4 4
397#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
398#define BPP8 8
399
400/* Combines the row recently read in with the previous row.
401 This routine takes care of alpha and transparency if requested.
402 This routine also handles the two methods of progressive display
403 of interlaced images, depending on the mask value.
404 The mask value describes which pixels are to be combined with
405 the row. The pattern always repeats every 8 pixels, so just 8
406 bits are needed. A one indicates the pixel is to be combined; a
407 zero indicates the pixel is to be skipped. This is in addition
408 to any alpha or transparency value associated with the pixel.
409 If you want all pixels to be combined, pass 0xff (255) in mask. */
410
411/* Use this routine for the x86 platform - it uses a faster MMX routine
412 if the machine supports MMX. */
413
414void /* PRIVATE */
415png_combine_row(png_structp png_ptr, png_bytep row, int mask)
416{
417 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
418
419#if defined(PNG_MMX_CODE_SUPPORTED)
420 if (_mmx_supported == 2) {
421#if !defined(PNG_1_0_X)
422 /* this should have happened in png_init_mmx_flags() already */
423 png_warning(png_ptr, "asm_flags may not have been initialized");
424#endif
425 png_mmx_support();
426 }
427#endif
428
429 if (mask == 0xff)
430 {
431 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
432 png_memcpy(row, png_ptr->row_buf + 1,
433 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
434 }
435 else /* (png_combine_row() is never called with mask == 0) */
436 {
437 switch (png_ptr->row_info.pixel_depth)
438 {
439 case 1: /* png_ptr->row_info.pixel_depth */
440 {
441 png_bytep sp;
442 png_bytep dp;
443 int s_inc, s_start, s_end;
444 int m;
445 int shift;
446 png_uint_32 i;
447
448 sp = png_ptr->row_buf + 1;
449 dp = row;
450 m = 0x80;
451#if defined(PNG_READ_PACKSWAP_SUPPORTED)
452 if (png_ptr->transformations & PNG_PACKSWAP)
453 {
454 s_start = 0;
455 s_end = 7;
456 s_inc = 1;
457 }
458 else
459#endif
460 {
461 s_start = 7;
462 s_end = 0;
463 s_inc = -1;
464 }
465
466 shift = s_start;
467
468 for (i = 0; i < png_ptr->width; i++)
469 {
470 if (m & mask)
471 {
472 int value;
473
474 value = (*sp >> shift) & 0x1;
475 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
476 *dp |= (png_byte)(value << shift);
477 }
478
479 if (shift == s_end)
480 {
481 shift = s_start;
482 sp++;
483 dp++;
484 }
485 else
486 shift += s_inc;
487
488 if (m == 1)
489 m = 0x80;
490 else
491 m >>= 1;
492 }
493 break;
494 }
495
496 case 2: /* png_ptr->row_info.pixel_depth */
497 {
498 png_bytep sp;
499 png_bytep dp;
500 int s_start, s_end, s_inc;
501 int m;
502 int shift;
503 png_uint_32 i;
504 int value;
505
506 sp = png_ptr->row_buf + 1;
507 dp = row;
508 m = 0x80;
509#if defined(PNG_READ_PACKSWAP_SUPPORTED)
510 if (png_ptr->transformations & PNG_PACKSWAP)
511 {
512 s_start = 0;
513 s_end = 6;
514 s_inc = 2;
515 }
516 else
517#endif
518 {
519 s_start = 6;
520 s_end = 0;
521 s_inc = -2;
522 }
523
524 shift = s_start;
525
526 for (i = 0; i < png_ptr->width; i++)
527 {
528 if (m & mask)
529 {
530 value = (*sp >> shift) & 0x3;
531 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
532 *dp |= (png_byte)(value << shift);
533 }
534
535 if (shift == s_end)
536 {
537 shift = s_start;
538 sp++;
539 dp++;
540 }
541 else
542 shift += s_inc;
543 if (m == 1)
544 m = 0x80;
545 else
546 m >>= 1;
547 }
548 break;
549 }
550
551 case 4: /* png_ptr->row_info.pixel_depth */
552 {
553 png_bytep sp;
554 png_bytep dp;
555 int s_start, s_end, s_inc;
556 int m;
557 int shift;
558 png_uint_32 i;
559 int value;
560
561 sp = png_ptr->row_buf + 1;
562 dp = row;
563 m = 0x80;
564#if defined(PNG_READ_PACKSWAP_SUPPORTED)
565 if (png_ptr->transformations & PNG_PACKSWAP)
566 {
567 s_start = 0;
568 s_end = 4;
569 s_inc = 4;
570 }
571 else
572#endif
573 {
574 s_start = 4;
575 s_end = 0;
576 s_inc = -4;
577 }
578 shift = s_start;
579
580 for (i = 0; i < png_ptr->width; i++)
581 {
582 if (m & mask)
583 {
584 value = (*sp >> shift) & 0xf;
585 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
586 *dp |= (png_byte)(value << shift);
587 }
588
589 if (shift == s_end)
590 {
591 shift = s_start;
592 sp++;
593 dp++;
594 }
595 else
596 shift += s_inc;
597 if (m == 1)
598 m = 0x80;
599 else
600 m >>= 1;
601 }
602 break;
603 }
604
605 case 8: /* png_ptr->row_info.pixel_depth */
606 {
607 png_bytep srcptr;
608 png_bytep dstptr;
609
610#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
611#if !defined(PNG_1_0_X)
612 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
613 /* && _mmx_supported */ )
614#else
615 if (_mmx_supported)
616#endif
617 {
618 png_uint_32 len;
619 int diff;
620 int dummy_value_a; // fix 'forbidden register spilled' error
621 int dummy_value_d;
622 int dummy_value_c;
623 int dummy_value_S;
624 int dummy_value_D;
625 _unmask = ~mask; // global variable for -fPIC version
626 srcptr = png_ptr->row_buf + 1;
627 dstptr = row;
628 len = png_ptr->width &~7; // reduce to multiple of 8
629 diff = (int) (png_ptr->width & 7); // amount lost
630
631 __asm__ __volatile__ (
632 "movd _unmask, %%mm7 \n\t" // load bit pattern
633 "psubb %%mm6, %%mm6 \n\t" // zero mm6
634 "punpcklbw %%mm7, %%mm7 \n\t"
635 "punpcklwd %%mm7, %%mm7 \n\t"
636 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
637
638 "movq _mask8_0, %%mm0 \n\t"
639 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
640 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
641
642// preload "movl len, %%ecx \n\t" // load length of line
643// preload "movl srcptr, %%esi \n\t" // load source
644// preload "movl dstptr, %%edi \n\t" // load dest
645
646 "cmpl $0, %%ecx \n\t" // len == 0 ?
647 "je mainloop8end \n\t"
648
649 "mainloop8: \n\t"
650 "movq (%%esi), %%mm4 \n\t" // *srcptr
651 "pand %%mm0, %%mm4 \n\t"
652 "movq %%mm0, %%mm6 \n\t"
653 "pandn (%%edi), %%mm6 \n\t" // *dstptr
654 "por %%mm6, %%mm4 \n\t"
655 "movq %%mm4, (%%edi) \n\t"
656 "addl $8, %%esi \n\t" // inc by 8 bytes processed
657 "addl $8, %%edi \n\t"
658 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
659 "ja mainloop8 \n\t"
660
661 "mainloop8end: \n\t"
662// preload "movl diff, %%ecx \n\t" // (diff is in eax)
663 "movl %%eax, %%ecx \n\t"
664 "cmpl $0, %%ecx \n\t"
665 "jz end8 \n\t"
666// preload "movl mask, %%edx \n\t"
667 "sall $24, %%edx \n\t" // make low byte, high byte
668
669 "secondloop8: \n\t"
670 "sall %%edx \n\t" // move high bit to CF
671 "jnc skip8 \n\t" // if CF = 0
672 "movb (%%esi), %%al \n\t"
673 "movb %%al, (%%edi) \n\t"
674
675 "skip8: \n\t"
676 "incl %%esi \n\t"
677 "incl %%edi \n\t"
678 "decl %%ecx \n\t"
679 "jnz secondloop8 \n\t"
680
681 "end8: \n\t"
682 "EMMS \n\t" // DONE
683
684 : "=a" (dummy_value_a), // output regs (dummy)
685 "=d" (dummy_value_d),
686 "=c" (dummy_value_c),
687 "=S" (dummy_value_S),
688 "=D" (dummy_value_D)
689
690 : "3" (srcptr), // esi // input regs
691 "4" (dstptr), // edi
692 "0" (diff), // eax
693// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
694 "2" (len), // ecx
695 "1" (mask) // edx
696
697#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
698 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
699#endif
700 );
701 }
702 else /* mmx _not supported - Use modified C routine */
703#endif /* PNG_MMX_CODE_SUPPORTED */
704 {
705 register png_uint_32 i;
706 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
707 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
708 register int stride = png_pass_inc[png_ptr->pass];
709 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
710 register int rep_bytes = png_pass_width[png_ptr->pass];
711 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
712 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
713 int diff = (int) (png_ptr->width & 7); /* amount lost */
714 register png_uint_32 final_val = len; /* GRR bugfix */
715
716 srcptr = png_ptr->row_buf + 1 + initial_val;
717 dstptr = row + initial_val;
718
719 for (i = initial_val; i < final_val; i += stride)
720 {
721 png_memcpy(dstptr, srcptr, rep_bytes);
722 srcptr += stride;
723 dstptr += stride;
724 }
725 if (diff) /* number of leftover pixels: 3 for pngtest */
726 {
727 final_val+=diff /* *BPP1 */ ;
728 for (; i < final_val; i += stride)
729 {
730 if (rep_bytes > (int)(final_val-i))
731 rep_bytes = (int)(final_val-i);
732 png_memcpy(dstptr, srcptr, rep_bytes);
733 srcptr += stride;
734 dstptr += stride;
735 }
736 }
737
738 } /* end of else (_mmx_supported) */
739
740 break;
741 } /* end 8 bpp */
742
743 case 16: /* png_ptr->row_info.pixel_depth */
744 {
745 png_bytep srcptr;
746 png_bytep dstptr;
747
748#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
749#if !defined(PNG_1_0_X)
750 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
751 /* && _mmx_supported */ )
752#else
753 if (_mmx_supported)
754#endif
755 {
756 png_uint_32 len;
757 int diff;
758 int dummy_value_a; // fix 'forbidden register spilled' error
759 int dummy_value_d;
760 int dummy_value_c;
761 int dummy_value_S;
762 int dummy_value_D;
763 _unmask = ~mask; // global variable for -fPIC version
764 srcptr = png_ptr->row_buf + 1;
765 dstptr = row;
766 len = png_ptr->width &~7; // reduce to multiple of 8
767 diff = (int) (png_ptr->width & 7); // amount lost //
768
769 __asm__ __volatile__ (
770 "movd _unmask, %%mm7 \n\t" // load bit pattern
771 "psubb %%mm6, %%mm6 \n\t" // zero mm6
772 "punpcklbw %%mm7, %%mm7 \n\t"
773 "punpcklwd %%mm7, %%mm7 \n\t"
774 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
775
776 "movq _mask16_0, %%mm0 \n\t"
777 "movq _mask16_1, %%mm1 \n\t"
778
779 "pand %%mm7, %%mm0 \n\t"
780 "pand %%mm7, %%mm1 \n\t"
781
782 "pcmpeqb %%mm6, %%mm0 \n\t"
783 "pcmpeqb %%mm6, %%mm1 \n\t"
784
785// preload "movl len, %%ecx \n\t" // load length of line
786// preload "movl srcptr, %%esi \n\t" // load source
787// preload "movl dstptr, %%edi \n\t" // load dest
788
789 "cmpl $0, %%ecx \n\t"
790 "jz mainloop16end \n\t"
791
792 "mainloop16: \n\t"
793 "movq (%%esi), %%mm4 \n\t"
794 "pand %%mm0, %%mm4 \n\t"
795 "movq %%mm0, %%mm6 \n\t"
796 "movq (%%edi), %%mm7 \n\t"
797 "pandn %%mm7, %%mm6 \n\t"
798 "por %%mm6, %%mm4 \n\t"
799 "movq %%mm4, (%%edi) \n\t"
800
801 "movq 8(%%esi), %%mm5 \n\t"
802 "pand %%mm1, %%mm5 \n\t"
803 "movq %%mm1, %%mm7 \n\t"
804 "movq 8(%%edi), %%mm6 \n\t"
805 "pandn %%mm6, %%mm7 \n\t"
806 "por %%mm7, %%mm5 \n\t"
807 "movq %%mm5, 8(%%edi) \n\t"
808
809 "addl $16, %%esi \n\t" // inc by 16 bytes processed
810 "addl $16, %%edi \n\t"
811 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
812 "ja mainloop16 \n\t"
813
814 "mainloop16end: \n\t"
815// preload "movl diff, %%ecx \n\t" // (diff is in eax)
816 "movl %%eax, %%ecx \n\t"
817 "cmpl $0, %%ecx \n\t"
818 "jz end16 \n\t"
819// preload "movl mask, %%edx \n\t"
820 "sall $24, %%edx \n\t" // make low byte, high byte
821
822 "secondloop16: \n\t"
823 "sall %%edx \n\t" // move high bit to CF
824 "jnc skip16 \n\t" // if CF = 0
825 "movw (%%esi), %%ax \n\t"
826 "movw %%ax, (%%edi) \n\t"
827
828 "skip16: \n\t"
829 "addl $2, %%esi \n\t"
830 "addl $2, %%edi \n\t"
831 "decl %%ecx \n\t"
832 "jnz secondloop16 \n\t"
833
834 "end16: \n\t"
835 "EMMS \n\t" // DONE
836
837 : "=a" (dummy_value_a), // output regs (dummy)
838 "=c" (dummy_value_c),
839 "=d" (dummy_value_d),
840 "=S" (dummy_value_S),
841 "=D" (dummy_value_D)
842
843 : "0" (diff), // eax // input regs
844// was (unmask) " " RESERVED // ebx // Global Offset Table idx
845 "1" (len), // ecx
846 "2" (mask), // edx
847 "3" (srcptr), // esi
848 "4" (dstptr) // edi
849
850#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
851 : "%mm0", "%mm1", "%mm4" // clobber list
852 , "%mm5", "%mm6", "%mm7"
853#endif
854 );
855 }
856 else /* mmx _not supported - Use modified C routine */
857#endif /* PNG_MMX_CODE_SUPPORTED */
858 {
859 register png_uint_32 i;
860 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
861 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
862 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
863 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
864 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
865 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
866 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
867 int diff = (int) (png_ptr->width & 7); /* amount lost */
868 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
869
870 srcptr = png_ptr->row_buf + 1 + initial_val;
871 dstptr = row + initial_val;
872
873 for (i = initial_val; i < final_val; i += stride)
874 {
875 png_memcpy(dstptr, srcptr, rep_bytes);
876 srcptr += stride;
877 dstptr += stride;
878 }
879 if (diff) /* number of leftover pixels: 3 for pngtest */
880 {
881 final_val+=diff*BPP2;
882 for (; i < final_val; i += stride)
883 {
884 if (rep_bytes > (int)(final_val-i))
885 rep_bytes = (int)(final_val-i);
886 png_memcpy(dstptr, srcptr, rep_bytes);
887 srcptr += stride;
888 dstptr += stride;
889 }
890 }
891 } /* end of else (_mmx_supported) */
892
893 break;
894 } /* end 16 bpp */
895
896 case 24: /* png_ptr->row_info.pixel_depth */
897 {
898 png_bytep srcptr;
899 png_bytep dstptr;
900
901#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
902#if !defined(PNG_1_0_X)
903 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
904 /* && _mmx_supported */ )
905#else
906 if (_mmx_supported)
907#endif
908 {
909 png_uint_32 len;
910 int diff;
911 int dummy_value_a; // fix 'forbidden register spilled' error
912 int dummy_value_d;
913 int dummy_value_c;
914 int dummy_value_S;
915 int dummy_value_D;
916 _unmask = ~mask; // global variable for -fPIC version
917 srcptr = png_ptr->row_buf + 1;
918 dstptr = row;
919 len = png_ptr->width &~7; // reduce to multiple of 8
920 diff = (int) (png_ptr->width & 7); // amount lost //
921
922 __asm__ __volatile__ (
923 "movd _unmask, %%mm7 \n\t" // load bit pattern
924 "psubb %%mm6, %%mm6 \n\t" // zero mm6
925 "punpcklbw %%mm7, %%mm7 \n\t"
926 "punpcklwd %%mm7, %%mm7 \n\t"
927 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
928
929 "movq _mask24_0, %%mm0 \n\t"
930 "movq _mask24_1, %%mm1 \n\t"
931 "movq _mask24_2, %%mm2 \n\t"
932
933 "pand %%mm7, %%mm0 \n\t"
934 "pand %%mm7, %%mm1 \n\t"
935 "pand %%mm7, %%mm2 \n\t"
936
937 "pcmpeqb %%mm6, %%mm0 \n\t"
938 "pcmpeqb %%mm6, %%mm1 \n\t"
939 "pcmpeqb %%mm6, %%mm2 \n\t"
940
941// preload "movl len, %%ecx \n\t" // load length of line
942// preload "movl srcptr, %%esi \n\t" // load source
943// preload "movl dstptr, %%edi \n\t" // load dest
944
945 "cmpl $0, %%ecx \n\t"
946 "jz mainloop24end \n\t"
947
948 "mainloop24: \n\t"
949 "movq (%%esi), %%mm4 \n\t"
950 "pand %%mm0, %%mm4 \n\t"
951 "movq %%mm0, %%mm6 \n\t"
952 "movq (%%edi), %%mm7 \n\t"
953 "pandn %%mm7, %%mm6 \n\t"
954 "por %%mm6, %%mm4 \n\t"
955 "movq %%mm4, (%%edi) \n\t"
956
957 "movq 8(%%esi), %%mm5 \n\t"
958 "pand %%mm1, %%mm5 \n\t"
959 "movq %%mm1, %%mm7 \n\t"
960 "movq 8(%%edi), %%mm6 \n\t"
961 "pandn %%mm6, %%mm7 \n\t"
962 "por %%mm7, %%mm5 \n\t"
963 "movq %%mm5, 8(%%edi) \n\t"
964
965 "movq 16(%%esi), %%mm6 \n\t"
966 "pand %%mm2, %%mm6 \n\t"
967 "movq %%mm2, %%mm4 \n\t"
968 "movq 16(%%edi), %%mm7 \n\t"
969 "pandn %%mm7, %%mm4 \n\t"
970 "por %%mm4, %%mm6 \n\t"
971 "movq %%mm6, 16(%%edi) \n\t"
972
973 "addl $24, %%esi \n\t" // inc by 24 bytes processed
974 "addl $24, %%edi \n\t"
975 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
976
977 "ja mainloop24 \n\t"
978
979 "mainloop24end: \n\t"
980// preload "movl diff, %%ecx \n\t" // (diff is in eax)
981 "movl %%eax, %%ecx \n\t"
982 "cmpl $0, %%ecx \n\t"
983 "jz end24 \n\t"
984// preload "movl mask, %%edx \n\t"
985 "sall $24, %%edx \n\t" // make low byte, high byte
986
987 "secondloop24: \n\t"
988 "sall %%edx \n\t" // move high bit to CF
989 "jnc skip24 \n\t" // if CF = 0
990 "movw (%%esi), %%ax \n\t"
991 "movw %%ax, (%%edi) \n\t"
992 "xorl %%eax, %%eax \n\t"
993 "movb 2(%%esi), %%al \n\t"
994 "movb %%al, 2(%%edi) \n\t"
995
996 "skip24: \n\t"
997 "addl $3, %%esi \n\t"
998 "addl $3, %%edi \n\t"
999 "decl %%ecx \n\t"
1000 "jnz secondloop24 \n\t"
1001
1002 "end24: \n\t"
1003 "EMMS \n\t" // DONE
1004
1005 : "=a" (dummy_value_a), // output regs (dummy)
1006 "=d" (dummy_value_d),
1007 "=c" (dummy_value_c),
1008 "=S" (dummy_value_S),
1009 "=D" (dummy_value_D)
1010
1011 : "3" (srcptr), // esi // input regs
1012 "4" (dstptr), // edi
1013 "0" (diff), // eax
1014// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1015 "2" (len), // ecx
1016 "1" (mask) // edx
1017
1018#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1019 : "%mm0", "%mm1", "%mm2" // clobber list
1020 , "%mm4", "%mm5", "%mm6", "%mm7"
1021#endif
1022 );
1023 }
1024 else /* mmx _not supported - Use modified C routine */
1025#endif /* PNG_MMX_CODE_SUPPORTED */
1026 {
1027 register png_uint_32 i;
1028 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1029 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1030 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1031 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1032 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1033 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1034 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1035 int diff = (int) (png_ptr->width & 7); /* amount lost */
1036 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1037
1038 srcptr = png_ptr->row_buf + 1 + initial_val;
1039 dstptr = row + initial_val;
1040
1041 for (i = initial_val; i < final_val; i += stride)
1042 {
1043 png_memcpy(dstptr, srcptr, rep_bytes);
1044 srcptr += stride;
1045 dstptr += stride;
1046 }
1047 if (diff) /* number of leftover pixels: 3 for pngtest */
1048 {
1049 final_val+=diff*BPP3;
1050 for (; i < final_val; i += stride)
1051 {
1052 if (rep_bytes > (int)(final_val-i))
1053 rep_bytes = (int)(final_val-i);
1054 png_memcpy(dstptr, srcptr, rep_bytes);
1055 srcptr += stride;
1056 dstptr += stride;
1057 }
1058 }
1059 } /* end of else (_mmx_supported) */
1060
1061 break;
1062 } /* end 24 bpp */
1063
1064 case 32: /* png_ptr->row_info.pixel_depth */
1065 {
1066 png_bytep srcptr;
1067 png_bytep dstptr;
1068
1069#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1070#if !defined(PNG_1_0_X)
1071 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1072 /* && _mmx_supported */ )
1073#else
1074 if (_mmx_supported)
1075#endif
1076 {
1077 png_uint_32 len;
1078 int diff;
1079 int dummy_value_a; // fix 'forbidden register spilled' error
1080 int dummy_value_d;
1081 int dummy_value_c;
1082 int dummy_value_S;
1083 int dummy_value_D;
1084 _unmask = ~mask; // global variable for -fPIC version
1085 srcptr = png_ptr->row_buf + 1;
1086 dstptr = row;
1087 len = png_ptr->width &~7; // reduce to multiple of 8
1088 diff = (int) (png_ptr->width & 7); // amount lost //
1089
1090 __asm__ __volatile__ (
1091 "movd _unmask, %%mm7 \n\t" // load bit pattern
1092 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1093 "punpcklbw %%mm7, %%mm7 \n\t"
1094 "punpcklwd %%mm7, %%mm7 \n\t"
1095 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1096
1097 "movq _mask32_0, %%mm0 \n\t"
1098 "movq _mask32_1, %%mm1 \n\t"
1099 "movq _mask32_2, %%mm2 \n\t"
1100 "movq _mask32_3, %%mm3 \n\t"
1101
1102 "pand %%mm7, %%mm0 \n\t"
1103 "pand %%mm7, %%mm1 \n\t"
1104 "pand %%mm7, %%mm2 \n\t"
1105 "pand %%mm7, %%mm3 \n\t"
1106
1107 "pcmpeqb %%mm6, %%mm0 \n\t"
1108 "pcmpeqb %%mm6, %%mm1 \n\t"
1109 "pcmpeqb %%mm6, %%mm2 \n\t"
1110 "pcmpeqb %%mm6, %%mm3 \n\t"
1111
1112// preload "movl len, %%ecx \n\t" // load length of line
1113// preload "movl srcptr, %%esi \n\t" // load source
1114// preload "movl dstptr, %%edi \n\t" // load dest
1115
1116 "cmpl $0, %%ecx \n\t" // lcr
1117 "jz mainloop32end \n\t"
1118
1119 "mainloop32: \n\t"
1120 "movq (%%esi), %%mm4 \n\t"
1121 "pand %%mm0, %%mm4 \n\t"
1122 "movq %%mm0, %%mm6 \n\t"
1123 "movq (%%edi), %%mm7 \n\t"
1124 "pandn %%mm7, %%mm6 \n\t"
1125 "por %%mm6, %%mm4 \n\t"
1126 "movq %%mm4, (%%edi) \n\t"
1127
1128 "movq 8(%%esi), %%mm5 \n\t"
1129 "pand %%mm1, %%mm5 \n\t"
1130 "movq %%mm1, %%mm7 \n\t"
1131 "movq 8(%%edi), %%mm6 \n\t"
1132 "pandn %%mm6, %%mm7 \n\t"
1133 "por %%mm7, %%mm5 \n\t"
1134 "movq %%mm5, 8(%%edi) \n\t"
1135
1136 "movq 16(%%esi), %%mm6 \n\t"
1137 "pand %%mm2, %%mm6 \n\t"
1138 "movq %%mm2, %%mm4 \n\t"
1139 "movq 16(%%edi), %%mm7 \n\t"
1140 "pandn %%mm7, %%mm4 \n\t"
1141 "por %%mm4, %%mm6 \n\t"
1142 "movq %%mm6, 16(%%edi) \n\t"
1143
1144 "movq 24(%%esi), %%mm7 \n\t"
1145 "pand %%mm3, %%mm7 \n\t"
1146 "movq %%mm3, %%mm5 \n\t"
1147 "movq 24(%%edi), %%mm4 \n\t"
1148 "pandn %%mm4, %%mm5 \n\t"
1149 "por %%mm5, %%mm7 \n\t"
1150 "movq %%mm7, 24(%%edi) \n\t"
1151
1152 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1153 "addl $32, %%edi \n\t"
1154 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1155 "ja mainloop32 \n\t"
1156
1157 "mainloop32end: \n\t"
1158// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1159 "movl %%eax, %%ecx \n\t"
1160 "cmpl $0, %%ecx \n\t"
1161 "jz end32 \n\t"
1162// preload "movl mask, %%edx \n\t"
1163 "sall $24, %%edx \n\t" // low byte => high byte
1164
1165 "secondloop32: \n\t"
1166 "sall %%edx \n\t" // move high bit to CF
1167 "jnc skip32 \n\t" // if CF = 0
1168 "movl (%%esi), %%eax \n\t"
1169 "movl %%eax, (%%edi) \n\t"
1170
1171 "skip32: \n\t"
1172 "addl $4, %%esi \n\t"
1173 "addl $4, %%edi \n\t"
1174 "decl %%ecx \n\t"
1175 "jnz secondloop32 \n\t"
1176
1177 "end32: \n\t"
1178 "EMMS \n\t" // DONE
1179
1180 : "=a" (dummy_value_a), // output regs (dummy)
1181 "=d" (dummy_value_d),
1182 "=c" (dummy_value_c),
1183 "=S" (dummy_value_S),
1184 "=D" (dummy_value_D)
1185
1186 : "3" (srcptr), // esi // input regs
1187 "4" (dstptr), // edi
1188 "0" (diff), // eax
1189// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1190 "2" (len), // ecx
1191 "1" (mask) // edx
1192
1193#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1194 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1195 , "%mm4", "%mm5", "%mm6", "%mm7"
1196#endif
1197 );
1198 }
1199 else /* mmx _not supported - Use modified C routine */
1200#endif /* PNG_MMX_CODE_SUPPORTED */
1201 {
1202 register png_uint_32 i;
1203 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1204 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1205 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1206 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1207 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1208 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1209 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1210 int diff = (int) (png_ptr->width & 7); /* amount lost */
1211 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1212
1213 srcptr = png_ptr->row_buf + 1 + initial_val;
1214 dstptr = row + initial_val;
1215
1216 for (i = initial_val; i < final_val; i += stride)
1217 {
1218 png_memcpy(dstptr, srcptr, rep_bytes);
1219 srcptr += stride;
1220 dstptr += stride;
1221 }
1222 if (diff) /* number of leftover pixels: 3 for pngtest */
1223 {
1224 final_val+=diff*BPP4;
1225 for (; i < final_val; i += stride)
1226 {
1227 if (rep_bytes > (int)(final_val-i))
1228 rep_bytes = (int)(final_val-i);
1229 png_memcpy(dstptr, srcptr, rep_bytes);
1230 srcptr += stride;
1231 dstptr += stride;
1232 }
1233 }
1234 } /* end of else (_mmx_supported) */
1235
1236 break;
1237 } /* end 32 bpp */
1238
1239 case 48: /* png_ptr->row_info.pixel_depth */
1240 {
1241 png_bytep srcptr;
1242 png_bytep dstptr;
1243
1244#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1245#if !defined(PNG_1_0_X)
1246 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1247 /* && _mmx_supported */ )
1248#else
1249 if (_mmx_supported)
1250#endif
1251 {
1252 png_uint_32 len;
1253 int diff;
1254 int dummy_value_a; // fix 'forbidden register spilled' error
1255 int dummy_value_d;
1256 int dummy_value_c;
1257 int dummy_value_S;
1258 int dummy_value_D;
1259 _unmask = ~mask; // global variable for -fPIC version
1260 srcptr = png_ptr->row_buf + 1;
1261 dstptr = row;
1262 len = png_ptr->width &~7; // reduce to multiple of 8
1263 diff = (int) (png_ptr->width & 7); // amount lost //
1264
1265 __asm__ __volatile__ (
1266 "movd _unmask, %%mm7 \n\t" // load bit pattern
1267 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1268 "punpcklbw %%mm7, %%mm7 \n\t"
1269 "punpcklwd %%mm7, %%mm7 \n\t"
1270 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1271
1272 "movq _mask48_0, %%mm0 \n\t"
1273 "movq _mask48_1, %%mm1 \n\t"
1274 "movq _mask48_2, %%mm2 \n\t"
1275 "movq _mask48_3, %%mm3 \n\t"
1276 "movq _mask48_4, %%mm4 \n\t"
1277 "movq _mask48_5, %%mm5 \n\t"
1278
1279 "pand %%mm7, %%mm0 \n\t"
1280 "pand %%mm7, %%mm1 \n\t"
1281 "pand %%mm7, %%mm2 \n\t"
1282 "pand %%mm7, %%mm3 \n\t"
1283 "pand %%mm7, %%mm4 \n\t"
1284 "pand %%mm7, %%mm5 \n\t"
1285
1286 "pcmpeqb %%mm6, %%mm0 \n\t"
1287 "pcmpeqb %%mm6, %%mm1 \n\t"
1288 "pcmpeqb %%mm6, %%mm2 \n\t"
1289 "pcmpeqb %%mm6, %%mm3 \n\t"
1290 "pcmpeqb %%mm6, %%mm4 \n\t"
1291 "pcmpeqb %%mm6, %%mm5 \n\t"
1292
1293// preload "movl len, %%ecx \n\t" // load length of line
1294// preload "movl srcptr, %%esi \n\t" // load source
1295// preload "movl dstptr, %%edi \n\t" // load dest
1296
1297 "cmpl $0, %%ecx \n\t"
1298 "jz mainloop48end \n\t"
1299
1300 "mainloop48: \n\t"
1301 "movq (%%esi), %%mm7 \n\t"
1302 "pand %%mm0, %%mm7 \n\t"
1303 "movq %%mm0, %%mm6 \n\t"
1304 "pandn (%%edi), %%mm6 \n\t"
1305 "por %%mm6, %%mm7 \n\t"
1306 "movq %%mm7, (%%edi) \n\t"
1307
1308 "movq 8(%%esi), %%mm6 \n\t"
1309 "pand %%mm1, %%mm6 \n\t"
1310 "movq %%mm1, %%mm7 \n\t"
1311 "pandn 8(%%edi), %%mm7 \n\t"
1312 "por %%mm7, %%mm6 \n\t"
1313 "movq %%mm6, 8(%%edi) \n\t"
1314
1315 "movq 16(%%esi), %%mm6 \n\t"
1316 "pand %%mm2, %%mm6 \n\t"
1317 "movq %%mm2, %%mm7 \n\t"
1318 "pandn 16(%%edi), %%mm7 \n\t"
1319 "por %%mm7, %%mm6 \n\t"
1320 "movq %%mm6, 16(%%edi) \n\t"
1321
1322 "movq 24(%%esi), %%mm7 \n\t"
1323 "pand %%mm3, %%mm7 \n\t"
1324 "movq %%mm3, %%mm6 \n\t"
1325 "pandn 24(%%edi), %%mm6 \n\t"
1326 "por %%mm6, %%mm7 \n\t"
1327 "movq %%mm7, 24(%%edi) \n\t"
1328
1329 "movq 32(%%esi), %%mm6 \n\t"
1330 "pand %%mm4, %%mm6 \n\t"
1331 "movq %%mm4, %%mm7 \n\t"
1332 "pandn 32(%%edi), %%mm7 \n\t"
1333 "por %%mm7, %%mm6 \n\t"
1334 "movq %%mm6, 32(%%edi) \n\t"
1335
1336 "movq 40(%%esi), %%mm7 \n\t"
1337 "pand %%mm5, %%mm7 \n\t"
1338 "movq %%mm5, %%mm6 \n\t"
1339 "pandn 40(%%edi), %%mm6 \n\t"
1340 "por %%mm6, %%mm7 \n\t"
1341 "movq %%mm7, 40(%%edi) \n\t"
1342
1343 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1344 "addl $48, %%edi \n\t"
1345 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1346
1347 "ja mainloop48 \n\t"
1348
1349 "mainloop48end: \n\t"
1350// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1351 "movl %%eax, %%ecx \n\t"
1352 "cmpl $0, %%ecx \n\t"
1353 "jz end48 \n\t"
1354// preload "movl mask, %%edx \n\t"
1355 "sall $24, %%edx \n\t" // make low byte, high byte
1356
1357 "secondloop48: \n\t"
1358 "sall %%edx \n\t" // move high bit to CF
1359 "jnc skip48 \n\t" // if CF = 0
1360 "movl (%%esi), %%eax \n\t"
1361 "movl %%eax, (%%edi) \n\t"
1362
1363 "skip48: \n\t"
1364 "addl $4, %%esi \n\t"
1365 "addl $4, %%edi \n\t"
1366 "decl %%ecx \n\t"
1367 "jnz secondloop48 \n\t"
1368
1369 "end48: \n\t"
1370 "EMMS \n\t" // DONE
1371
1372 : "=a" (dummy_value_a), // output regs (dummy)
1373 "=d" (dummy_value_d),
1374 "=c" (dummy_value_c),
1375 "=S" (dummy_value_S),
1376 "=D" (dummy_value_D)
1377
1378 : "3" (srcptr), // esi // input regs
1379 "4" (dstptr), // edi
1380 "0" (diff), // eax
1381// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1382 "2" (len), // ecx
1383 "1" (mask) // edx
1384
1385#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1386 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1387 , "%mm4", "%mm5", "%mm6", "%mm7"
1388#endif
1389 );
1390 }
1391 else /* mmx _not supported - Use modified C routine */
1392#endif /* PNG_MMX_CODE_SUPPORTED */
1393 {
1394 register png_uint_32 i;
1395 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1396 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1397 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1398 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1399 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1400 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1401 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1402 int diff = (int) (png_ptr->width & 7); /* amount lost */
1403 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1404
1405 srcptr = png_ptr->row_buf + 1 + initial_val;
1406 dstptr = row + initial_val;
1407
1408 for (i = initial_val; i < final_val; i += stride)
1409 {
1410 png_memcpy(dstptr, srcptr, rep_bytes);
1411 srcptr += stride;
1412 dstptr += stride;
1413 }
1414 if (diff) /* number of leftover pixels: 3 for pngtest */
1415 {
1416 final_val+=diff*BPP6;
1417 for (; i < final_val; i += stride)
1418 {
1419 if (rep_bytes > (int)(final_val-i))
1420 rep_bytes = (int)(final_val-i);
1421 png_memcpy(dstptr, srcptr, rep_bytes);
1422 srcptr += stride;
1423 dstptr += stride;
1424 }
1425 }
1426 } /* end of else (_mmx_supported) */
1427
1428 break;
1429 } /* end 48 bpp */
1430
1431 case 64: /* png_ptr->row_info.pixel_depth */
1432 {
1433 png_bytep srcptr;
1434 png_bytep dstptr;
1435 register png_uint_32 i;
1436 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1437 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1438 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1439 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1440 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1441 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1442 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1443 int diff = (int) (png_ptr->width & 7); /* amount lost */
1444 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1445
1446 srcptr = png_ptr->row_buf + 1 + initial_val;
1447 dstptr = row + initial_val;
1448
1449 for (i = initial_val; i < final_val; i += stride)
1450 {
1451 png_memcpy(dstptr, srcptr, rep_bytes);
1452 srcptr += stride;
1453 dstptr += stride;
1454 }
1455 if (diff) /* number of leftover pixels: 3 for pngtest */
1456 {
1457 final_val+=diff*BPP8;
1458 for (; i < final_val; i += stride)
1459 {
1460 if (rep_bytes > (int)(final_val-i))
1461 rep_bytes = (int)(final_val-i);
1462 png_memcpy(dstptr, srcptr, rep_bytes);
1463 srcptr += stride;
1464 dstptr += stride;
1465 }
1466 }
1467
1468 break;
1469 } /* end 64 bpp */
1470
1471 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1472 {
1473 /* this should never happen */
1474 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1475 break;
1476 }
1477 } /* end switch (png_ptr->row_info.pixel_depth) */
1478
1479 } /* end if (non-trivial mask) */
1480
1481} /* end png_combine_row() */
1482
1483#endif /* PNG_HAVE_MMX_COMBINE_ROW */
1484
1485
1486
1487
1488/*===========================================================================*/
1489/* */
1490/* P N G _ D O _ R E A D _ I N T E R L A C E */
1491/* */
1492/*===========================================================================*/
1493
1494#if defined(PNG_READ_INTERLACING_SUPPORTED)
1495#if defined(PNG_HAVE_MMX_READ_INTERLACE)
1496
1497/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1498 * has taken place. [GRR: what other steps come before and/or after?]
1499 */
1500
1501void /* PRIVATE */
1502png_do_read_interlace(png_structp png_ptr)
1503{
1504 png_row_infop row_info = &(png_ptr->row_info);
1505 png_bytep row = png_ptr->row_buf + 1;
1506 int pass = png_ptr->pass;
1507#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1508 png_uint_32 transformations = png_ptr->transformations;
1509#endif
1510
1511 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1512
1513#if defined(PNG_MMX_CODE_SUPPORTED)
1514 if (_mmx_supported == 2) {
1515#if !defined(PNG_1_0_X)
1516 /* this should have happened in png_init_mmx_flags() already */
1517 png_warning(png_ptr, "asm_flags may not have been initialized");
1518#endif
1519 png_mmx_support();
1520 }
1521#endif
1522
1523 if (row != NULL && row_info != NULL)
1524 {
1525 png_uint_32 final_width;
1526
1527 final_width = row_info->width * png_pass_inc[pass];
1528
1529 switch (row_info->pixel_depth)
1530 {
1531 case 1:
1532 {
1533 png_bytep sp, dp;
1534 int sshift, dshift;
1535 int s_start, s_end, s_inc;
1536 png_byte v;
1537 png_uint_32 i;
1538 int j;
1539
1540 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1541 dp = row + (png_size_t)((final_width - 1) >> 3);
1542#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1543 if (transformations & PNG_PACKSWAP)
1544 {
1545 sshift = (int)((row_info->width + 7) & 7);
1546 dshift = (int)((final_width + 7) & 7);
1547 s_start = 7;
1548 s_end = 0;
1549 s_inc = -1;
1550 }
1551 else
1552#endif
1553 {
1554 sshift = 7 - (int)((row_info->width + 7) & 7);
1555 dshift = 7 - (int)((final_width + 7) & 7);
1556 s_start = 0;
1557 s_end = 7;
1558 s_inc = 1;
1559 }
1560
1561 for (i = row_info->width; i; i--)
1562 {
1563 v = (png_byte)((*sp >> sshift) & 0x1);
1564 for (j = 0; j < png_pass_inc[pass]; j++)
1565 {
1566 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1567 *dp |= (png_byte)(v << dshift);
1568 if (dshift == s_end)
1569 {
1570 dshift = s_start;
1571 dp--;
1572 }
1573 else
1574 dshift += s_inc;
1575 }
1576 if (sshift == s_end)
1577 {
1578 sshift = s_start;
1579 sp--;
1580 }
1581 else
1582 sshift += s_inc;
1583 }
1584 break;
1585 }
1586
1587 case 2:
1588 {
1589 png_bytep sp, dp;
1590 int sshift, dshift;
1591 int s_start, s_end, s_inc;
1592 png_uint_32 i;
1593
1594 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1595 dp = row + (png_size_t)((final_width - 1) >> 2);
1596#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1597 if (transformations & PNG_PACKSWAP)
1598 {
1599 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1600 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1601 s_start = 6;
1602 s_end = 0;
1603 s_inc = -2;
1604 }
1605 else
1606#endif
1607 {
1608 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1609 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1610 s_start = 0;
1611 s_end = 6;
1612 s_inc = 2;
1613 }
1614
1615 for (i = row_info->width; i; i--)
1616 {
1617 png_byte v;
1618 int j;
1619
1620 v = (png_byte)((*sp >> sshift) & 0x3);
1621 for (j = 0; j < png_pass_inc[pass]; j++)
1622 {
1623 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1624 *dp |= (png_byte)(v << dshift);
1625 if (dshift == s_end)
1626 {
1627 dshift = s_start;
1628 dp--;
1629 }
1630 else
1631 dshift += s_inc;
1632 }
1633 if (sshift == s_end)
1634 {
1635 sshift = s_start;
1636 sp--;
1637 }
1638 else
1639 sshift += s_inc;
1640 }
1641 break;
1642 }
1643
1644 case 4:
1645 {
1646 png_bytep sp, dp;
1647 int sshift, dshift;
1648 int s_start, s_end, s_inc;
1649 png_uint_32 i;
1650
1651 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1652 dp = row + (png_size_t)((final_width - 1) >> 1);
1653#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1654 if (transformations & PNG_PACKSWAP)
1655 {
1656 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1657 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1658 s_start = 4;
1659 s_end = 0;
1660 s_inc = -4;
1661 }
1662 else
1663#endif
1664 {
1665 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1666 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1667 s_start = 0;
1668 s_end = 4;
1669 s_inc = 4;
1670 }
1671
1672 for (i = row_info->width; i; i--)
1673 {
1674 png_byte v;
1675 int j;
1676
1677 v = (png_byte)((*sp >> sshift) & 0xf);
1678 for (j = 0; j < png_pass_inc[pass]; j++)
1679 {
1680 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1681 *dp |= (png_byte)(v << dshift);
1682 if (dshift == s_end)
1683 {
1684 dshift = s_start;
1685 dp--;
1686 }
1687 else
1688 dshift += s_inc;
1689 }
1690 if (sshift == s_end)
1691 {
1692 sshift = s_start;
1693 sp--;
1694 }
1695 else
1696 sshift += s_inc;
1697 }
1698 break;
1699 }
1700
1701 /*====================================================================*/
1702
1703 default: /* 8-bit or larger (this is where the routine is modified) */
1704 {
1705#if 0
1706// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1708// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1709// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1710#endif
1711 png_bytep sptr, dp;
1712 png_uint_32 i;
1713 png_size_t pixel_bytes;
1714 int width = (int)row_info->width;
1715
1716 pixel_bytes = (row_info->pixel_depth >> 3);
1717
1718 /* point sptr at the last pixel in the pre-expanded row: */
1719 sptr = row + (width - 1) * pixel_bytes;
1720
1721 /* point dp at the last pixel position in the expanded row: */
1722 dp = row + (final_width - 1) * pixel_bytes;
1723
1724 /* New code by Nirav Chhatrapati - Intel Corporation */
1725
1726#if defined(PNG_MMX_CODE_SUPPORTED)
1727#if !defined(PNG_1_0_X)
1728 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1729 /* && _mmx_supported */ )
1730#else
1731 if (_mmx_supported)
1732#endif
1733 {
1734 //--------------------------------------------------------------
1735 if (pixel_bytes == 3)
1736 {
1737 if (((pass == 0) || (pass == 1)) && width)
1738 {
1739 int dummy_value_c; // fix 'forbidden register spilled'
1740 int dummy_value_S;
1741 int dummy_value_D;
1742 int dummy_value_a;
1743
1744 __asm__ __volatile__ (
1745 "subl $21, %%edi \n\t"
1746 // (png_pass_inc[pass] - 1)*pixel_bytes
1747
1748 ".loop3_pass0: \n\t"
1749 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1750 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1751 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1752 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1753 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1754 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1755 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1756 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1757 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1758 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1759 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1760 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1761 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1762 "movq %%mm4, 16(%%edi) \n\t"
1763 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1764 "movq %%mm3, 8(%%edi) \n\t"
1765 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1766 "subl $3, %%esi \n\t"
1767 "movq %%mm0, (%%edi) \n\t"
1768 "subl $24, %%edi \n\t"
1769 "decl %%ecx \n\t"
1770 "jnz .loop3_pass0 \n\t"
1771 "EMMS \n\t" // DONE
1772
1773 : "=c" (dummy_value_c), // output regs (dummy)
1774 "=S" (dummy_value_S),
1775 "=D" (dummy_value_D),
1776 "=a" (dummy_value_a)
1777
1778
1779 : "1" (sptr), // esi // input regs
1780 "2" (dp), // edi
1781 "0" (width), // ecx
1782 "3" (&_const4) // %3 (0x0000000000FFFFFFLL)
1783
1784#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1785 : "%mm0", "%mm1", "%mm2" // clobber list
1786 , "%mm3", "%mm4"
1787#endif
1788 );
1789 }
1790 else if (((pass == 2) || (pass == 3)) && width)
1791 {
1792 int dummy_value_c; // fix 'forbidden register spilled'
1793 int dummy_value_S;
1794 int dummy_value_D;
1795 int dummy_value_a;
1796
1797 __asm__ __volatile__ (
1798 "subl $9, %%edi \n\t"
1799 // (png_pass_inc[pass] - 1)*pixel_bytes
1800
1801 ".loop3_pass2: \n\t"
1802 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1803 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1804 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1805 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1806 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1807 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1808 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1809 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1810 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1811 "movq %%mm0, 4(%%edi) \n\t"
1812 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1813 "subl $3, %%esi \n\t"
1814 "movd %%mm0, (%%edi) \n\t"
1815 "subl $12, %%edi \n\t"
1816 "decl %%ecx \n\t"
1817 "jnz .loop3_pass2 \n\t"
1818 "EMMS \n\t" // DONE
1819
1820 : "=c" (dummy_value_c), // output regs (dummy)
1821 "=S" (dummy_value_S),
1822 "=D" (dummy_value_D),
1823 "=a" (dummy_value_a)
1824
1825 : "1" (sptr), // esi // input regs
1826 "2" (dp), // edi
1827 "0" (width), // ecx
1828 "3" (&_const4) // (0x0000000000FFFFFFLL)
1829
1830#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1831 : "%mm0", "%mm1", "%mm2" // clobber list
1832#endif
1833 );
1834 }
1835 else if (width) /* && ((pass == 4) || (pass == 5)) */
1836 {
1837 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1838 if (width_mmx < 0)
1839 width_mmx = 0;
1840 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1841 if (width_mmx)
1842 {
1843 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1844 // sptr points at last pixel in pre-expanded row
1845 // dp points at last pixel position in expanded row
1846 int dummy_value_c; // fix 'forbidden register spilled'
1847 int dummy_value_S;
1848 int dummy_value_D;
1849 int dummy_value_a;
1850 int dummy_value_d;
1851
1852 __asm__ __volatile__ (
1853 "subl $3, %%esi \n\t"
1854 "subl $9, %%edi \n\t"
1855 // (png_pass_inc[pass] + 1)*pixel_bytes
1856
1857 ".loop3_pass4: \n\t"
1858 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1859 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1860 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1861 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1862 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1863 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1864 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1865 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1866 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1867 "movq %%mm0, (%%edi) \n\t"
1868 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1869 "pand (%4), %%mm3 \n\t" // z z z z z z z 5
1870 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1871 "subl $6, %%esi \n\t"
1872 "movd %%mm2, 8(%%edi) \n\t"
1873 "subl $12, %%edi \n\t"
1874 "subl $2, %%ecx \n\t"
1875 "jnz .loop3_pass4 \n\t"
1876 "EMMS \n\t" // DONE
1877
1878 : "=c" (dummy_value_c), // output regs (dummy)
1879 "=S" (dummy_value_S),
1880 "=D" (dummy_value_D),
1881 "=a" (dummy_value_a),
1882 "=d" (dummy_value_d)
1883
1884 : "1" (sptr), // esi // input regs
1885 "2" (dp), // edi
1886 "0" (width_mmx), // ecx
1887 "3" (&_const4), // 0x0000000000FFFFFFLL
1888 "4" (&_const6) // 0x00000000000000FFLL
1889
1890#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1891 : "%mm0", "%mm1" // clobber list
1892 , "%mm2", "%mm3"
1893#endif
1894 );
1895 }
1896
1897 sptr -= width_mmx*3;
1898 dp -= width_mmx*6;
1899 for (i = width; i; i--)
1900 {
1901 png_byte v[8];
1902 int j;
1903
1904 png_memcpy(v, sptr, 3);
1905 for (j = 0; j < png_pass_inc[pass]; j++)
1906 {
1907 png_memcpy(dp, v, 3);
1908 dp -= 3;
1909 }
1910 sptr -= 3;
1911 }
1912 }
1913 } /* end of pixel_bytes == 3 */
1914
1915 //--------------------------------------------------------------
1916 else if (pixel_bytes == 1)
1917 {
1918 if (((pass == 0) || (pass == 1)) && width)
1919 {
1920 int width_mmx = ((width >> 2) << 2);
1921 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1922 if (width_mmx)
1923 {
1924 int dummy_value_c; // fix 'forbidden register spilled'
1925 int dummy_value_S;
1926 int dummy_value_D;
1927
1928 __asm__ __volatile__ (
1929 "subl $3, %%esi \n\t"
1930 "subl $31, %%edi \n\t"
1931
1932 ".loop1_pass0: \n\t"
1933 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1934 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1935 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1936 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1937 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1938 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1939 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1940 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1941 "movq %%mm0, (%%edi) \n\t"
1942 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1943 "movq %%mm3, 8(%%edi) \n\t"
1944 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1945 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1946 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1947 "movq %%mm2, 16(%%edi) \n\t"
1948 "subl $4, %%esi \n\t"
1949 "movq %%mm4, 24(%%edi) \n\t"
1950 "subl $32, %%edi \n\t"
1951 "subl $4, %%ecx \n\t"
1952 "jnz .loop1_pass0 \n\t"
1953 "EMMS \n\t" // DONE
1954
1955 : "=c" (dummy_value_c), // output regs (dummy)
1956 "=S" (dummy_value_S),
1957 "=D" (dummy_value_D)
1958
1959 : "1" (sptr), // esi // input regs
1960 "2" (dp), // edi
1961 "0" (width_mmx) // ecx
1962
1963#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1964 : "%mm0", "%mm1", "%mm2" // clobber list
1965 , "%mm3", "%mm4"
1966#endif
1967 );
1968 }
1969
1970 sptr -= width_mmx;
1971 dp -= width_mmx*8;
1972 for (i = width; i; i--)
1973 {
1974 int j;
1975
1976 /* I simplified this part in version 1.0.4e
1977 * here and in several other instances where
1978 * pixel_bytes == 1 -- GR-P
1979 *
1980 * Original code:
1981 *
1982 * png_byte v[8];
1983 * png_memcpy(v, sptr, pixel_bytes);
1984 * for (j = 0; j < png_pass_inc[pass]; j++)
1985 * {
1986 * png_memcpy(dp, v, pixel_bytes);
1987 * dp -= pixel_bytes;
1988 * }
1989 * sptr -= pixel_bytes;
1990 *
1991 * Replacement code is in the next three lines:
1992 */
1993
1994 for (j = 0; j < png_pass_inc[pass]; j++)
1995 {
1996 *dp-- = *sptr;
1997 }
1998 --sptr;
1999 }
2000 }
2001 else if (((pass == 2) || (pass == 3)) && width)
2002 {
2003 int width_mmx = ((width >> 2) << 2);
2004 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2005 if (width_mmx)
2006 {
2007 int dummy_value_c; // fix 'forbidden register spilled'
2008 int dummy_value_S;
2009 int dummy_value_D;
2010
2011 __asm__ __volatile__ (
2012 "subl $3, %%esi \n\t"
2013 "subl $15, %%edi \n\t"
2014
2015 ".loop1_pass2: \n\t"
2016 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2017 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2018 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2019 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2020 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2021 "movq %%mm0, (%%edi) \n\t"
2022 "subl $4, %%esi \n\t"
2023 "movq %%mm1, 8(%%edi) \n\t"
2024 "subl $16, %%edi \n\t"
2025 "subl $4, %%ecx \n\t"
2026 "jnz .loop1_pass2 \n\t"
2027 "EMMS \n\t" // DONE
2028
2029 : "=c" (dummy_value_c), // output regs (dummy)
2030 "=S" (dummy_value_S),
2031 "=D" (dummy_value_D)
2032
2033 : "1" (sptr), // esi // input regs
2034 "2" (dp), // edi
2035 "0" (width_mmx) // ecx
2036
2037#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2038 : "%mm0", "%mm1" // clobber list
2039#endif
2040 );
2041 }
2042
2043 sptr -= width_mmx;
2044 dp -= width_mmx*4;
2045 for (i = width; i; i--)
2046 {
2047 int j;
2048
2049 for (j = 0; j < png_pass_inc[pass]; j++)
2050 {
2051 *dp-- = *sptr;
2052 }
2053 --sptr;
2054 }
2055 }
2056 else if (width) /* && ((pass == 4) || (pass == 5)) */
2057 {
2058 int width_mmx = ((width >> 3) << 3);
2059 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2060 if (width_mmx)
2061 {
2062 int dummy_value_c; // fix 'forbidden register spilled'
2063 int dummy_value_S;
2064 int dummy_value_D;
2065
2066 __asm__ __volatile__ (
2067 "subl $7, %%esi \n\t"
2068 "subl $15, %%edi \n\t"
2069
2070 ".loop1_pass4: \n\t"
2071 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2072 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2073 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2074 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2075 "movq %%mm1, 8(%%edi) \n\t"
2076 "subl $8, %%esi \n\t"
2077 "movq %%mm0, (%%edi) \n\t"
2078 "subl $16, %%edi \n\t"
2079 "subl $8, %%ecx \n\t"
2080 "jnz .loop1_pass4 \n\t"
2081 "EMMS \n\t" // DONE
2082
2083 : "=c" (dummy_value_c), // output regs (none)
2084 "=S" (dummy_value_S),
2085 "=D" (dummy_value_D)
2086
2087 : "1" (sptr), // esi // input regs
2088 "2" (dp), // edi
2089 "0" (width_mmx) // ecx
2090
2091#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2092 : "%mm0", "%mm1" // clobber list
2093#endif
2094 );
2095 }
2096
2097 sptr -= width_mmx;
2098 dp -= width_mmx*2;
2099 for (i = width; i; i--)
2100 {
2101 int j;
2102
2103 for (j = 0; j < png_pass_inc[pass]; j++)
2104 {
2105 *dp-- = *sptr;
2106 }
2107 --sptr;
2108 }
2109 }
2110 } /* end of pixel_bytes == 1 */
2111
2112 //--------------------------------------------------------------
2113 else if (pixel_bytes == 2)
2114 {
2115 if (((pass == 0) || (pass == 1)) && width)
2116 {
2117 int width_mmx = ((width >> 1) << 1);
2118 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2119 if (width_mmx)
2120 {
2121 int dummy_value_c; // fix 'forbidden register spilled'
2122 int dummy_value_S;
2123 int dummy_value_D;
2124
2125 __asm__ __volatile__ (
2126 "subl $2, %%esi \n\t"
2127 "subl $30, %%edi \n\t"
2128
2129 ".loop2_pass0: \n\t"
2130 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2131 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2132 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2133 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2134 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2135 "movq %%mm0, (%%edi) \n\t"
2136 "movq %%mm0, 8(%%edi) \n\t"
2137 "movq %%mm1, 16(%%edi) \n\t"
2138 "subl $4, %%esi \n\t"
2139 "movq %%mm1, 24(%%edi) \n\t"
2140 "subl $32, %%edi \n\t"
2141 "subl $2, %%ecx \n\t"
2142 "jnz .loop2_pass0 \n\t"
2143 "EMMS \n\t" // DONE
2144
2145 : "=c" (dummy_value_c), // output regs (dummy)
2146 "=S" (dummy_value_S),
2147 "=D" (dummy_value_D)
2148
2149 : "1" (sptr), // esi // input regs
2150 "2" (dp), // edi
2151 "0" (width_mmx) // ecx
2152
2153#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2154 : "%mm0", "%mm1" // clobber list
2155#endif
2156 );
2157 }
2158
2159 sptr -= (width_mmx*2 - 2); // sign fixed
2160 dp -= (width_mmx*16 - 2); // sign fixed
2161 for (i = width; i; i--)
2162 {
2163 png_byte v[8];
2164 int j;
2165 sptr -= 2;
2166 png_memcpy(v, sptr, 2);
2167 for (j = 0; j < png_pass_inc[pass]; j++)
2168 {
2169 dp -= 2;
2170 png_memcpy(dp, v, 2);
2171 }
2172 }
2173 }
2174 else if (((pass == 2) || (pass == 3)) && width)
2175 {
2176 int width_mmx = ((width >> 1) << 1) ;
2177 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2178 if (width_mmx)
2179 {
2180 int dummy_value_c; // fix 'forbidden register spilled'
2181 int dummy_value_S;
2182 int dummy_value_D;
2183
2184 __asm__ __volatile__ (
2185 "subl $2, %%esi \n\t"
2186 "subl $14, %%edi \n\t"
2187
2188 ".loop2_pass2: \n\t"
2189 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2190 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2191 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2192 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2193 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2194 "movq %%mm0, (%%edi) \n\t"
2195 "subl $4, %%esi \n\t"
2196 "movq %%mm1, 8(%%edi) \n\t"
2197 "subl $16, %%edi \n\t"
2198 "subl $2, %%ecx \n\t"
2199 "jnz .loop2_pass2 \n\t"
2200 "EMMS \n\t" // DONE
2201
2202 : "=c" (dummy_value_c), // output regs (dummy)
2203 "=S" (dummy_value_S),
2204 "=D" (dummy_value_D)
2205
2206 : "1" (sptr), // esi // input regs
2207 "2" (dp), // edi
2208 "0" (width_mmx) // ecx
2209
2210#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2211 : "%mm0", "%mm1" // clobber list
2212#endif
2213 );
2214 }
2215
2216 sptr -= (width_mmx*2 - 2); // sign fixed
2217 dp -= (width_mmx*8 - 2); // sign fixed
2218 for (i = width; i; i--)
2219 {
2220 png_byte v[8];
2221 int j;
2222 sptr -= 2;
2223 png_memcpy(v, sptr, 2);
2224 for (j = 0; j < png_pass_inc[pass]; j++)
2225 {
2226 dp -= 2;
2227 png_memcpy(dp, v, 2);
2228 }
2229 }
2230 }
2231 else if (width) // pass == 4 or 5
2232 {
2233 int width_mmx = ((width >> 1) << 1) ;
2234 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2235 if (width_mmx)
2236 {
2237 int dummy_value_c; // fix 'forbidden register spilled'
2238 int dummy_value_S;
2239 int dummy_value_D;
2240
2241 __asm__ __volatile__ (
2242 "subl $2, %%esi \n\t"
2243 "subl $6, %%edi \n\t"
2244
2245 ".loop2_pass4: \n\t"
2246 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2247 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2248 "subl $4, %%esi \n\t"
2249 "movq %%mm0, (%%edi) \n\t"
2250 "subl $8, %%edi \n\t"
2251 "subl $2, %%ecx \n\t"
2252 "jnz .loop2_pass4 \n\t"
2253 "EMMS \n\t" // DONE
2254
2255 : "=c" (dummy_value_c), // output regs (dummy)
2256 "=S" (dummy_value_S),
2257 "=D" (dummy_value_D)
2258
2259 : "1" (sptr), // esi // input regs
2260 "2" (dp), // edi
2261 "0" (width_mmx) // ecx
2262
2263#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2264 : "%mm0" // clobber list
2265#endif
2266 );
2267 }
2268
2269 sptr -= (width_mmx*2 - 2); // sign fixed
2270 dp -= (width_mmx*4 - 2); // sign fixed
2271 for (i = width; i; i--)
2272 {
2273 png_byte v[8];
2274 int j;
2275 sptr -= 2;
2276 png_memcpy(v, sptr, 2);
2277 for (j = 0; j < png_pass_inc[pass]; j++)
2278 {
2279 dp -= 2;
2280 png_memcpy(dp, v, 2);
2281 }
2282 }
2283 }
2284 } /* end of pixel_bytes == 2 */
2285
2286 //--------------------------------------------------------------
2287 else if (pixel_bytes == 4)
2288 {
2289 if (((pass == 0) || (pass == 1)) && width)
2290 {
2291 int width_mmx = ((width >> 1) << 1);
2292 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2293 if (width_mmx)
2294 {
2295 int dummy_value_c; // fix 'forbidden register spilled'
2296 int dummy_value_S;
2297 int dummy_value_D;
2298
2299 __asm__ __volatile__ (
2300 "subl $4, %%esi \n\t"
2301 "subl $60, %%edi \n\t"
2302
2303 ".loop4_pass0: \n\t"
2304 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2305 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2306 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2307 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2308 "movq %%mm0, (%%edi) \n\t"
2309 "movq %%mm0, 8(%%edi) \n\t"
2310 "movq %%mm0, 16(%%edi) \n\t"
2311 "movq %%mm0, 24(%%edi) \n\t"
2312 "movq %%mm1, 32(%%edi) \n\t"
2313 "movq %%mm1, 40(%%edi) \n\t"
2314 "movq %%mm1, 48(%%edi) \n\t"
2315 "subl $8, %%esi \n\t"
2316 "movq %%mm1, 56(%%edi) \n\t"
2317 "subl $64, %%edi \n\t"
2318 "subl $2, %%ecx \n\t"
2319 "jnz .loop4_pass0 \n\t"
2320 "EMMS \n\t" // DONE
2321
2322 : "=c" (dummy_value_c), // output regs (dummy)
2323 "=S" (dummy_value_S),
2324 "=D" (dummy_value_D)
2325
2326 : "1" (sptr), // esi // input regs
2327 "2" (dp), // edi
2328 "0" (width_mmx) // ecx
2329
2330#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2331 : "%mm0", "%mm1" // clobber list
2332#endif
2333 );
2334 }
2335
2336 sptr -= (width_mmx*4 - 4); // sign fixed
2337 dp -= (width_mmx*32 - 4); // sign fixed
2338 for (i = width; i; i--)
2339 {
2340 png_byte v[8];
2341 int j;
2342 sptr -= 4;
2343 png_memcpy(v, sptr, 4);
2344 for (j = 0; j < png_pass_inc[pass]; j++)
2345 {
2346 dp -= 4;
2347 png_memcpy(dp, v, 4);
2348 }
2349 }
2350 }
2351 else if (((pass == 2) || (pass == 3)) && width)
2352 {
2353 int width_mmx = ((width >> 1) << 1);
2354 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2355 if (width_mmx)
2356 {
2357 int dummy_value_c; // fix 'forbidden register spilled'
2358 int dummy_value_S;
2359 int dummy_value_D;
2360
2361 __asm__ __volatile__ (
2362 "subl $4, %%esi \n\t"
2363 "subl $28, %%edi \n\t"
2364
2365 ".loop4_pass2: \n\t"
2366 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2367 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2368 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2369 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2370 "movq %%mm0, (%%edi) \n\t"
2371 "movq %%mm0, 8(%%edi) \n\t"
2372 "movq %%mm1, 16(%%edi) \n\t"
2373 "movq %%mm1, 24(%%edi) \n\t"
2374 "subl $8, %%esi \n\t"
2375 "subl $32, %%edi \n\t"
2376 "subl $2, %%ecx \n\t"
2377 "jnz .loop4_pass2 \n\t"
2378 "EMMS \n\t" // DONE
2379
2380 : "=c" (dummy_value_c), // output regs (dummy)
2381 "=S" (dummy_value_S),
2382 "=D" (dummy_value_D)
2383
2384 : "1" (sptr), // esi // input regs
2385 "2" (dp), // edi
2386 "0" (width_mmx) // ecx
2387
2388#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2389 : "%mm0", "%mm1" // clobber list
2390#endif
2391 );
2392 }
2393
2394 sptr -= (width_mmx*4 - 4); // sign fixed
2395 dp -= (width_mmx*16 - 4); // sign fixed
2396 for (i = width; i; i--)
2397 {
2398 png_byte v[8];
2399 int j;
2400 sptr -= 4;
2401 png_memcpy(v, sptr, 4);
2402 for (j = 0; j < png_pass_inc[pass]; j++)
2403 {
2404 dp -= 4;
2405 png_memcpy(dp, v, 4);
2406 }
2407 }
2408 }
2409 else if (width) // pass == 4 or 5
2410 {
2411 int width_mmx = ((width >> 1) << 1) ;
2412 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2413 if (width_mmx)
2414 {
2415 int dummy_value_c; // fix 'forbidden register spilled'
2416 int dummy_value_S;
2417 int dummy_value_D;
2418
2419 __asm__ __volatile__ (
2420 "subl $4, %%esi \n\t"
2421 "subl $12, %%edi \n\t"
2422
2423 ".loop4_pass4: \n\t"
2424 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2425 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2426 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2427 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2428 "movq %%mm0, (%%edi) \n\t"
2429 "subl $8, %%esi \n\t"
2430 "movq %%mm1, 8(%%edi) \n\t"
2431 "subl $16, %%edi \n\t"
2432 "subl $2, %%ecx \n\t"
2433 "jnz .loop4_pass4 \n\t"
2434 "EMMS \n\t" // DONE
2435
2436 : "=c" (dummy_value_c), // output regs (dummy)
2437 "=S" (dummy_value_S),
2438 "=D" (dummy_value_D)
2439
2440 : "1" (sptr), // esi // input regs
2441 "2" (dp), // edi
2442 "0" (width_mmx) // ecx
2443
2444#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2445 : "%mm0", "%mm1" // clobber list
2446#endif
2447 );
2448 }
2449
2450 sptr -= (width_mmx*4 - 4); // sign fixed
2451 dp -= (width_mmx*8 - 4); // sign fixed
2452 for (i = width; i; i--)
2453 {
2454 png_byte v[8];
2455 int j;
2456 sptr -= 4;
2457 png_memcpy(v, sptr, 4);
2458 for (j = 0; j < png_pass_inc[pass]; j++)
2459 {
2460 dp -= 4;
2461 png_memcpy(dp, v, 4);
2462 }
2463 }
2464 }
2465 } /* end of pixel_bytes == 4 */
2466
2467 //--------------------------------------------------------------
2468 else if (pixel_bytes == 8)
2469 {
2470// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2471 // GRR NOTE: no need to combine passes here!
2472 if (((pass == 0) || (pass == 1)) && width)
2473 {
2474 int dummy_value_c; // fix 'forbidden register spilled'
2475 int dummy_value_S;
2476 int dummy_value_D;
2477
2478 // source is 8-byte RRGGBBAA
2479 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2480 __asm__ __volatile__ (
2481 "subl $56, %%edi \n\t" // start of last block
2482
2483 ".loop8_pass0: \n\t"
2484 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2485 "movq %%mm0, (%%edi) \n\t"
2486 "movq %%mm0, 8(%%edi) \n\t"
2487 "movq %%mm0, 16(%%edi) \n\t"
2488 "movq %%mm0, 24(%%edi) \n\t"
2489 "movq %%mm0, 32(%%edi) \n\t"
2490 "movq %%mm0, 40(%%edi) \n\t"
2491 "movq %%mm0, 48(%%edi) \n\t"
2492 "subl $8, %%esi \n\t"
2493 "movq %%mm0, 56(%%edi) \n\t"
2494 "subl $64, %%edi \n\t"
2495 "decl %%ecx \n\t"
2496 "jnz .loop8_pass0 \n\t"
2497 "EMMS \n\t" // DONE
2498
2499 : "=c" (dummy_value_c), // output regs (dummy)
2500 "=S" (dummy_value_S),
2501 "=D" (dummy_value_D)
2502
2503 : "1" (sptr), // esi // input regs
2504 "2" (dp), // edi
2505 "0" (width) // ecx
2506
2507#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2508 : "%mm0" // clobber list
2509#endif
2510 );
2511 }
2512 else if (((pass == 2) || (pass == 3)) && width)
2513 {
2514 // source is 8-byte RRGGBBAA
2515 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2516 // (recall that expansion is _in place_: sptr and dp
2517 // both point at locations within same row buffer)
2518 {
2519 int dummy_value_c; // fix 'forbidden register spilled'
2520 int dummy_value_S;
2521 int dummy_value_D;
2522
2523 __asm__ __volatile__ (
2524 "subl $24, %%edi \n\t" // start of last block
2525
2526 ".loop8_pass2: \n\t"
2527 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2528 "movq %%mm0, (%%edi) \n\t"
2529 "movq %%mm0, 8(%%edi) \n\t"
2530 "movq %%mm0, 16(%%edi) \n\t"
2531 "subl $8, %%esi \n\t"
2532 "movq %%mm0, 24(%%edi) \n\t"
2533 "subl $32, %%edi \n\t"
2534 "decl %%ecx \n\t"
2535 "jnz .loop8_pass2 \n\t"
2536 "EMMS \n\t" // DONE
2537
2538 : "=c" (dummy_value_c), // output regs (dummy)
2539 "=S" (dummy_value_S),
2540 "=D" (dummy_value_D)
2541
2542 : "1" (sptr), // esi // input regs
2543 "2" (dp), // edi
2544 "0" (width) // ecx
2545
2546#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2547 : "%mm0" // clobber list
2548#endif
2549 );
2550 }
2551 }
2552 else if (width) // pass == 4 or 5
2553 {
2554 // source is 8-byte RRGGBBAA
2555 // dest is 16-byte RRGGBBAA RRGGBBAA
2556 {
2557 int dummy_value_c; // fix 'forbidden register spilled'
2558 int dummy_value_S;
2559 int dummy_value_D;
2560
2561 __asm__ __volatile__ (
2562 "subl $8, %%edi \n\t" // start of last block
2563
2564 ".loop8_pass4: \n\t"
2565 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2566 "movq %%mm0, (%%edi) \n\t"
2567 "subl $8, %%esi \n\t"
2568 "movq %%mm0, 8(%%edi) \n\t"
2569 "subl $16, %%edi \n\t"
2570 "decl %%ecx \n\t"
2571 "jnz .loop8_pass4 \n\t"
2572 "EMMS \n\t" // DONE
2573
2574 : "=c" (dummy_value_c), // output regs (dummy)
2575 "=S" (dummy_value_S),
2576 "=D" (dummy_value_D)
2577
2578 : "1" (sptr), // esi // input regs
2579 "2" (dp), // edi
2580 "0" (width) // ecx
2581
2582#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2583 : "%mm0" // clobber list
2584#endif
2585 );
2586 }
2587 }
2588
2589 } /* end of pixel_bytes == 8 */
2590
2591 //--------------------------------------------------------------
2592 else if (pixel_bytes == 6)
2593 {
2594 for (i = width; i; i--)
2595 {
2596 png_byte v[8];
2597 int j;
2598 png_memcpy(v, sptr, 6);
2599 for (j = 0; j < png_pass_inc[pass]; j++)
2600 {
2601 png_memcpy(dp, v, 6);
2602 dp -= 6;
2603 }
2604 sptr -= 6;
2605 }
2606 } /* end of pixel_bytes == 6 */
2607
2608 //--------------------------------------------------------------
2609 else
2610 {
2611 for (i = width; i; i--)
2612 {
2613 png_byte v[8];
2614 int j;
2615 png_memcpy(v, sptr, pixel_bytes);
2616 for (j = 0; j < png_pass_inc[pass]; j++)
2617 {
2618 png_memcpy(dp, v, pixel_bytes);
2619 dp -= pixel_bytes;
2620 }
2621 sptr-= pixel_bytes;
2622 }
2623 }
2624 } // end of _mmx_supported ========================================
2625
2626 else /* MMX not supported: use modified C code - takes advantage
2627 * of inlining of png_memcpy for a constant */
2628 /* GRR 19991007: does it? or should pixel_bytes in each
2629 * block be replaced with immediate value (e.g., 1)? */
2630 /* GRR 19991017: replaced with constants in each case */
2631#endif /* PNG_MMX_CODE_SUPPORTED */
2632 {
2633 if (pixel_bytes == 1)
2634 {
2635 for (i = width; i; i--)
2636 {
2637 int j;
2638 for (j = 0; j < png_pass_inc[pass]; j++)
2639 {
2640 *dp-- = *sptr;
2641 }
2642 --sptr;
2643 }
2644 }
2645 else if (pixel_bytes == 3)
2646 {
2647 for (i = width; i; i--)
2648 {
2649 png_byte v[8];
2650 int j;
2651 png_memcpy(v, sptr, 3);
2652 for (j = 0; j < png_pass_inc[pass]; j++)
2653 {
2654 png_memcpy(dp, v, 3);
2655 dp -= 3;
2656 }
2657 sptr -= 3;
2658 }
2659 }
2660 else if (pixel_bytes == 2)
2661 {
2662 for (i = width; i; i--)
2663 {
2664 png_byte v[8];
2665 int j;
2666 png_memcpy(v, sptr, 2);
2667 for (j = 0; j < png_pass_inc[pass]; j++)
2668 {
2669 png_memcpy(dp, v, 2);
2670 dp -= 2;
2671 }
2672 sptr -= 2;
2673 }
2674 }
2675 else if (pixel_bytes == 4)
2676 {
2677 for (i = width; i; i--)
2678 {
2679 png_byte v[8];
2680 int j;
2681 png_memcpy(v, sptr, 4);
2682 for (j = 0; j < png_pass_inc[pass]; j++)
2683 {
2684#ifdef PNG_DEBUG
2685 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2686 {
2687 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2688 row, dp, row+png_ptr->row_buf_size);
2689 printf("row_buf=%d\n",png_ptr->row_buf_size);
2690 }
2691#endif
2692 png_memcpy(dp, v, 4);
2693 dp -= 4;
2694 }
2695 sptr -= 4;
2696 }
2697 }
2698 else if (pixel_bytes == 6)
2699 {
2700 for (i = width; i; i--)
2701 {
2702 png_byte v[8];
2703 int j;
2704 png_memcpy(v, sptr, 6);
2705 for (j = 0; j < png_pass_inc[pass]; j++)
2706 {
2707 png_memcpy(dp, v, 6);
2708 dp -= 6;
2709 }
2710 sptr -= 6;
2711 }
2712 }
2713 else if (pixel_bytes == 8)
2714 {
2715 for (i = width; i; i--)
2716 {
2717 png_byte v[8];
2718 int j;
2719 png_memcpy(v, sptr, 8);
2720 for (j = 0; j < png_pass_inc[pass]; j++)
2721 {
2722 png_memcpy(dp, v, 8);
2723 dp -= 8;
2724 }
2725 sptr -= 8;
2726 }
2727 }
2728 else /* GRR: should never be reached */
2729 {
2730 for (i = width; i; i--)
2731 {
2732 png_byte v[8];
2733 int j;
2734 png_memcpy(v, sptr, pixel_bytes);
2735 for (j = 0; j < png_pass_inc[pass]; j++)
2736 {
2737 png_memcpy(dp, v, pixel_bytes);
2738 dp -= pixel_bytes;
2739 }
2740 sptr -= pixel_bytes;
2741 }
2742 }
2743
2744 } /* end if (MMX not supported) */
2745 break;
2746 }
2747 } /* end switch (row_info->pixel_depth) */
2748
2749 row_info->width = final_width;
2750
2751 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2752 }
2753
2754} /* end png_do_read_interlace() */
2755
2756#endif /* PNG_HAVE_MMX_READ_INTERLACE */
2757#endif /* PNG_READ_INTERLACING_SUPPORTED */
2758
2759
2760
2761#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2762#if defined(PNG_MMX_CODE_SUPPORTED)
2763
// 64-bit operands shared by the MMX filter routines below.
//
// The inline assembly refers to these objects directly by symbol name
// (e.g. "movq _LBCarryMask, %%mm5", "psrlq _ShiftRem, %%mm2"), so they
// must remain at file scope with external linkage and keep exactly these
// names.
//
//   _LBCarryMask   0x01 in every byte; ANDed with a row to isolate the
//                  low bit of each byte (the carry lost when halving).
//   _HBClearMask   0x7f in every byte; ANDed after a 64-bit "psrlq $1" to
//                  clear the bit shifted into bit 7 of each byte from its
//                  neighbor.
//   _ActiveMask, _ShiftBpp, _ShiftRem
//                  scratch values that the filter routines store per call
//                  (selected by bytes-per-pixel) before running their asm.
//                  Because they are shared and rewritten on every call,
//                  the routines using them are not thread-safe; they sit
//                  under PNG_THREAD_UNSAFE_OK.
//   _ActiveMask2, _ActiveMaskEnd
//                  presumably scratch masks for the other filter routines
//                  in this file -- NOTE(review): confirm against their asm.
//
// Alignment: every object is read or written as one 8-byte MMX memory
// operand.  The member types alone do not guarantee 8-byte alignment on
// 32-bit x86 (without -malign-double, long long and double are only
// 4-byte aligned inside an aggregate), so the alignment is requested
// explicitly on the union type.  The attribute is placed after the closing
// brace, the position accepted by older GCC releases as well.
// The `align' member is never accessed by name; judging by its name it was
// the original attempt at raising the union's alignment, and it is kept so
// the type's layout is unchanged.

union uAll {
   long long use;    // the 64-bit payload loaded/stored by the asm
   double  align;    // alignment helper only (see above)
} __attribute__((aligned(8)))
   _LBCarryMask = {0x0101010101010101LL},
   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2773
2774#ifdef PNG_THREAD_UNSAFE_OK
2775//===========================================================================//
2776// //
2777// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2778// //
2779//===========================================================================//
2780
2781// Optimized code for PNG Average filter decoder
2782
2783static void /* PRIVATE */
2784png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2785 png_bytep prev_row)
2786{
2787 int bpp;
2788 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2789 int dummy_value_S;
2790 int dummy_value_D;
2791
2792 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2793 _FullLength = row_info->rowbytes; // # of bytes to filter
2794
2795 __asm__ __volatile__ (
2796 // initialize address pointers and offset
2797#ifdef __PIC__
2798 "pushl %%ebx \n\t" // save index to Global Offset Table
2799#endif
2800//pre "movl row, %%edi \n\t" // edi: Avg(x)
2801 "xorl %%ebx, %%ebx \n\t" // ebx: x
2802 "movl %%edi, %%edx \n\t"
2803//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2804//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2805 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2806
2807 "xorl %%eax,%%eax \n\t"
2808
2809 // Compute the Raw value for the first bpp bytes
2810 // Raw(x) = Avg(x) + (Prior(x)/2)
2811 "avg_rlp: \n\t"
2812 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2813 "incl %%ebx \n\t"
2814 "shrb %%al \n\t" // divide by 2
2815 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2816//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2817 "cmpl %%ecx, %%ebx \n\t"
2818 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2819 "jb avg_rlp \n\t" // mov does not affect flags
2820
2821 // get # of bytes to alignment
2822 "movl %%edi, _dif \n\t" // take start of row
2823 "addl %%ebx, _dif \n\t" // add bpp
2824 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2825 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2826 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2827 "jz avg_go \n\t" // alignment
2828
2829 // fix alignment
2830 // Compute the Raw value for the bytes up to the alignment boundary
2831 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2832 "xorl %%ecx, %%ecx \n\t"
2833
2834 "avg_lp1: \n\t"
2835 "xorl %%eax, %%eax \n\t"
2836 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2837 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2838 "addw %%cx, %%ax \n\t"
2839 "incl %%ebx \n\t"
2840 "shrw %%ax \n\t" // divide by 2
2841 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2842 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2843 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2844 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2845
2846 "avg_go: \n\t"
2847 "movl _FullLength, %%eax \n\t"
2848 "movl %%eax, %%ecx \n\t"
2849 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2850 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2851 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2852 "movl %%ecx, _MMXLength \n\t"
2853#ifdef __PIC__
2854 "popl %%ebx \n\t" // restore index to Global Offset Table
2855#endif
2856
2857 : "=c" (dummy_value_c), // output regs (dummy)
2858 "=S" (dummy_value_S),
2859 "=D" (dummy_value_D)
2860
2861 : "0" (bpp), // ecx // input regs
2862 "1" (prev_row), // esi
2863 "2" (row) // edi
2864
2865 : "%eax", "%edx" // clobber list
2866#ifndef __PIC__
2867 , "%ebx"
2868#endif
2869 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2870 // (seems to work fine without...)
2871 );
2872
2873 // now do the math for the rest of the row
2874 switch (bpp)
2875 {
2876 case 3:
2877 {
2878 _ActiveMask.use = 0x0000000000ffffffLL;
2879 _ShiftBpp.use = 24; // == 3 * 8
2880 _ShiftRem.use = 40; // == 64 - 24
2881
2882 __asm__ __volatile__ (
2883 // re-init address pointers and offset
2884 "movq _ActiveMask, %%mm7 \n\t"
2885 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2886 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2887// preload "movl row, %%edi \n\t" // edi: Avg(x)
2888 "movq _HBClearMask, %%mm4 \n\t"
2889// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2890
2891 // prime the pump: load the first Raw(x-bpp) data set
2892 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2893 // (correct pos. in loop below)
2894 "avg_3lp: \n\t"
2895 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2896 "movq %%mm5, %%mm3 \n\t"
2897 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2898 // data
2899 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2900 "movq %%mm7, %%mm6 \n\t"
2901 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2902 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2903 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2904 // byte
2905 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2906 // each byte
2907 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2908 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2909 // LBCarrys
2910 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2911 // where both
2912 // lsb's were == 1 (only valid for active group)
2913 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2914 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2915 // byte
2916 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2917 // for each byte
2918 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2919 // bytes to add to Avg
2920 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2921 // Avg for each Active
2922 // byte
2923 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2924 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2925 // bytes 3-5
2926 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2927 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2928 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2929 // LBCarrys
2930 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2931 // where both
2932 // lsb's were == 1 (only valid for active group)
2933 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2934 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2935 // byte
2936 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2937 // for each byte
2938 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2939 // bytes to add to Avg
2940 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2941 // Avg for each Active
2942 // byte
2943
2944 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2945 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2946 // two
2947 // bytes
2948 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2949 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2950 // Data only needs to be shifted once here to
2951 // get the correct x-bpp offset.
2952 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2953 // LBCarrys
2954 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2955 // where both
2956 // lsb's were == 1 (only valid for active group)
2957 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2958 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2959 // byte
2960 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2961 // for each byte
2962 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2963 // bytes to add to Avg
2964 "addl $8, %%ecx \n\t"
2965 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2966 // Avg for each Active
2967 // byte
2968 // now ready to write back to memory
2969 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2970 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2971 "cmpl _MMXLength, %%ecx \n\t"
2972 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2973 "jb avg_3lp \n\t"
2974
2975 : "=S" (dummy_value_S), // output regs (dummy)
2976 "=D" (dummy_value_D)
2977
2978 : "0" (prev_row), // esi // input regs
2979 "1" (row) // edi
2980
2981 : "%ecx" // clobber list
2982#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2983 , "%mm0", "%mm1", "%mm2", "%mm3"
2984 , "%mm4", "%mm5", "%mm6", "%mm7"
2985#endif
2986 );
2987 }
2988 break; // end 3 bpp
2989
2990 case 6:
2991 case 4:
2992 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2993 //case 5: // GRR BOGUS
2994 {
2995 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2996 // appropriate inactive bytes
2997 _ShiftBpp.use = bpp << 3;
2998 _ShiftRem.use = 64 - _ShiftBpp.use;
2999
3000 __asm__ __volatile__ (
3001 "movq _HBClearMask, %%mm4 \n\t"
3002
3003 // re-init address pointers and offset
3004 "movl _dif, %%ecx \n\t" // ecx: x = offset to
3005 // alignment boundary
3006
3007 // load _ActiveMask and clear all bytes except for 1st active group
3008 "movq _ActiveMask, %%mm7 \n\t"
3009// preload "movl row, %%edi \n\t" // edi: Avg(x)
3010 "psrlq _ShiftRem, %%mm7 \n\t"
3011// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3012 "movq %%mm7, %%mm6 \n\t"
3013 "movq _LBCarryMask, %%mm5 \n\t"
3014 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3015 // group
3016
3017 // prime the pump: load the first Raw(x-bpp) data set
3018 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3019 // (we correct pos. in loop below)
3020 "avg_4lp: \n\t"
3021 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3022 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3023 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3024 // add (Prev_row/2) to average
3025 "movq %%mm5, %%mm3 \n\t"
3026 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3027 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3028 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3029 // byte
3030 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3031 // each byte
3032 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3033 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3034 // LBCarrys
3035 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3036 // where both
3037 // lsb's were == 1 (only valid for active group)
3038 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3039 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3040 // byte
3041 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3042 // for each byte
3043 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3044 // bytes to add to Avg
3045 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3046 // for each Active
3047 // byte
3048 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3049 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3050 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3051 "addl $8, %%ecx \n\t"
3052 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3053 // LBCarrys
3054 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3055 // where both
3056 // lsb's were == 1 (only valid for active group)
3057 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3058 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3059 // byte
3060 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3061 // for each byte
3062 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3063 // bytes to add to Avg
3064 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3065 // Avg for each Active
3066 // byte
3067 "cmpl _MMXLength, %%ecx \n\t"
3068 // now ready to write back to memory
3069 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3070 // prep Raw(x-bpp) for next loop
3071 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3072 "jb avg_4lp \n\t"
3073
3074 : "=S" (dummy_value_S), // output regs (dummy)
3075 "=D" (dummy_value_D)
3076
3077 : "0" (prev_row), // esi // input regs
3078 "1" (row) // edi
3079
3080 : "%ecx" // clobber list
3081#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3082 , "%mm0", "%mm1", "%mm2", "%mm3"
3083 , "%mm4", "%mm5", "%mm6", "%mm7"
3084#endif
3085 );
3086 }
3087 break; // end 4,6 bpp
3088
3089 case 2:
3090 {
3091 _ActiveMask.use = 0x000000000000ffffLL;
3092 _ShiftBpp.use = 16; // == 2 * 8
3093 _ShiftRem.use = 48; // == 64 - 16
3094
3095 __asm__ __volatile__ (
3096 // load _ActiveMask
3097 "movq _ActiveMask, %%mm7 \n\t"
3098 // re-init address pointers and offset
3099 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3100 // boundary
3101 "movq _LBCarryMask, %%mm5 \n\t"
3102// preload "movl row, %%edi \n\t" // edi: Avg(x)
3103 "movq _HBClearMask, %%mm4 \n\t"
3104// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3105
3106 // prime the pump: load the first Raw(x-bpp) data set
3107 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3108 // (we correct pos. in loop below)
3109 "avg_2lp: \n\t"
3110 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3111 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3112 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3113 // add (Prev_row/2) to average
3114 "movq %%mm5, %%mm3 \n\t"
3115 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3116 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3117 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3118 // byte
3119 "movq %%mm7, %%mm6 \n\t"
3120 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3121 // each byte
3122
3123 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3124 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3125 // LBCarrys
3126 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3127 // where both
3128 // lsb's were == 1 (only valid
3129 // for active group)
3130 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3131 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3132 // byte
3133 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3134 // for each byte
3135 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3136 // bytes to add to Avg
3137 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3138 // for each Active byte
3139
3140 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3141 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3142 // bytes 2 & 3
3143 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3144 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3145 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3146 // LBCarrys
3147 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3148 // where both
3149 // lsb's were == 1 (only valid
3150 // for active group)
3151 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3152 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3153 // byte
3154 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3155 // for each byte
3156 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3157 // bytes to add to Avg
3158 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3159 // Avg for each Active byte
3160
3161 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3162 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3163 // bytes 4 & 5
3164 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3165 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3166 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3167 // LBCarrys
3168 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3169 // where both lsb's were == 1
3170 // (only valid for active group)
3171 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3172 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3173 // byte
3174 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3175 // for each byte
3176 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3177 // bytes to add to Avg
3178 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3179 // Avg for each Active byte
3180
3181 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3182 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3183 // bytes 6 & 7
3184 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3185 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3186 "addl $8, %%ecx \n\t"
3187 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3188 // LBCarrys
3189 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3190 // where both
3191 // lsb's were == 1 (only valid
3192 // for active group)
3193 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3194 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3195 // byte
3196 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3197 // for each byte
3198 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3199 // bytes to add to Avg
3200 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3201 // Avg for each Active byte
3202
3203 "cmpl _MMXLength, %%ecx \n\t"
3204 // now ready to write back to memory
3205 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3206 // prep Raw(x-bpp) for next loop
3207 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3208 "jb avg_2lp \n\t"
3209
3210 : "=S" (dummy_value_S), // output regs (dummy)
3211 "=D" (dummy_value_D)
3212
3213 : "0" (prev_row), // esi // input regs
3214 "1" (row) // edi
3215
3216 : "%ecx" // clobber list
3217#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3218 , "%mm0", "%mm1", "%mm2", "%mm3"
3219 , "%mm4", "%mm5", "%mm6", "%mm7"
3220#endif
3221 );
3222 }
3223 break; // end 2 bpp
3224
3225 case 1:
3226 {
3227 __asm__ __volatile__ (
3228 // re-init address pointers and offset
3229#ifdef __PIC__
3230 "pushl %%ebx \n\t" // save Global Offset Table index
3231#endif
3232 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3233 // boundary
3234// preload "movl row, %%edi \n\t" // edi: Avg(x)
3235 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3236 "jnb avg_1end \n\t"
3237 // do Avg decode for remaining bytes
3238// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3239 "movl %%edi, %%edx \n\t"
3240// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3241 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3242 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3243 // in loop below
3244 "avg_1lp: \n\t"
3245 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3246 "xorl %%eax, %%eax \n\t"
3247 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3248 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3249 "addw %%cx, %%ax \n\t"
3250 "incl %%ebx \n\t"
3251 "shrw %%ax \n\t" // divide by 2
3252 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3253 // inc ebx
3254 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3255 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3256 // mov does not affect flags; -1 to offset inc ebx
3257 "jb avg_1lp \n\t"
3258
3259 "avg_1end: \n\t"
3260#ifdef __PIC__
3261 "popl %%ebx \n\t" // Global Offset Table index
3262#endif
3263
3264 : "=c" (dummy_value_c), // output regs (dummy)
3265 "=S" (dummy_value_S),
3266 "=D" (dummy_value_D)
3267
3268 : "0" (bpp), // ecx // input regs
3269 "1" (prev_row), // esi
3270 "2" (row) // edi
3271
3272 : "%eax", "%edx" // clobber list
3273#ifndef __PIC__
3274 , "%ebx"
3275#endif
3276 );
3277 }
3278 return; // end 1 bpp
3279
3280 case 8:
3281 {
3282 __asm__ __volatile__ (
3283 // re-init address pointers and offset
3284 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3285 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3286// preload "movl row, %%edi \n\t" // edi: Avg(x)
3287 "movq _HBClearMask, %%mm4 \n\t"
3288// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3289
3290 // prime the pump: load the first Raw(x-bpp) data set
3291 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3292 // (NO NEED to correct pos. in loop below)
3293
3294 "avg_8lp: \n\t"
3295 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3296 "movq %%mm5, %%mm3 \n\t"
3297 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3298 "addl $8, %%ecx \n\t"
3299 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3300 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3301 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3302 // where both lsb's were == 1
3303 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3304 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3305 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3306 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3307 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3308 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3309 "cmpl _MMXLength, %%ecx \n\t"
3310 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3311 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3312 "jb avg_8lp \n\t"
3313
3314 : "=S" (dummy_value_S), // output regs (dummy)
3315 "=D" (dummy_value_D)
3316
3317 : "0" (prev_row), // esi // input regs
3318 "1" (row) // edi
3319
3320 : "%ecx" // clobber list
3321#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3322 , "%mm0", "%mm1", "%mm2"
3323 , "%mm3", "%mm4", "%mm5"
3324#endif
3325 );
3326 }
3327 break; // end 8 bpp
3328
3329 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3330 {
3331
3332#ifdef PNG_DEBUG
3333 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3334 png_debug(1,
3335 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3336#endif
3337
3338#if 0
3339 __asm__ __volatile__ (
3340 "movq _LBCarryMask, %%mm5 \n\t"
3341 // re-init address pointers and offset
3342 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3343 // alignment boundary
3344 "movl row, %%edi \n\t" // edi: Avg(x)
3345 "movq _HBClearMask, %%mm4 \n\t"
3346 "movl %%edi, %%edx \n\t"
3347 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3348 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3349 "avg_Alp: \n\t"
3350 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3351 "movq %%mm5, %%mm3 \n\t"
3352 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3353 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3354 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3355 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3356 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3357 // where both lsb's were == 1
3358 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3359 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3360 // byte
3361 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3362 // byte
3363 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3364 // byte
3365 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3366 // each byte
3367 "addl $8, %%ebx \n\t"
3368 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3369 // byte
3370 "cmpl _MMXLength, %%ebx \n\t"
3371 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3372 "jb avg_Alp \n\t"
3373
3374 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3375
3376 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3377
3378 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3379 );
3380#endif /* 0 - NEVER REACHED */
3381 }
3382 break;
3383
3384 } // end switch (bpp)
3385
3386 __asm__ __volatile__ (
3387 // MMX acceleration complete; now do clean-up
3388 // check if any remaining bytes left to decode
3389#ifdef __PIC__
3390 "pushl %%ebx \n\t" // save index to Global Offset Table
3391#endif
3392 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3393//pre "movl row, %%edi \n\t" // edi: Avg(x)
3394 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3395 "jnb avg_end \n\t"
3396
3397 // do Avg decode for remaining bytes
3398//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3399 "movl %%edi, %%edx \n\t"
3400//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3401 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3402 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3403
3404 "avg_lp2: \n\t"
3405 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3406 "xorl %%eax, %%eax \n\t"
3407 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3408 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3409 "addw %%cx, %%ax \n\t"
3410 "incl %%ebx \n\t"
3411 "shrw %%ax \n\t" // divide by 2
3412 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3413 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3414 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3415 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3416
3417 "avg_end: \n\t"
3418 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3419#ifdef __PIC__
3420 "popl %%ebx \n\t" // restore index to Global Offset Table
3421#endif
3422
3423 : "=c" (dummy_value_c), // output regs (dummy)
3424 "=S" (dummy_value_S),
3425 "=D" (dummy_value_D)
3426
3427 : "0" (bpp), // ecx // input regs
3428 "1" (prev_row), // esi
3429 "2" (row) // edi
3430
3431 : "%eax", "%edx" // clobber list
3432#ifndef __PIC__
3433 , "%ebx"
3434#endif
3435 );
3436
3437} /* end png_read_filter_row_mmx_avg() */
3438#endif
3439
3440
3441
3442#ifdef PNG_THREAD_UNSAFE_OK
3443//===========================================================================//
3444// //
3445// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3446// //
3447//===========================================================================//
3448
3449// Optimized code for PNG Paeth filter decoder
3450
3451static void /* PRIVATE */
3452png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3453 png_bytep prev_row)
3454{
3455 int bpp;
3456 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3457 int dummy_value_S;
3458 int dummy_value_D;
3459
3460 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3461 _FullLength = row_info->rowbytes; // # of bytes to filter
3462
3463 __asm__ __volatile__ (
3464#ifdef __PIC__
3465 "pushl %%ebx \n\t" // save index to Global Offset Table
3466#endif
3467 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3468//pre "movl row, %%edi \n\t"
3469 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3470//pre "movl prev_row, %%esi \n\t"
3471 "xorl %%eax, %%eax \n\t"
3472
3473 // Compute the Raw value for the first bpp bytes
3474 // Note: the formula works out to be always
3475 // Raw(x) = Paeth(x) + Prior(x) where x < bpp
3476 "paeth_rlp: \n\t"
3477 "movb (%%edi,%%ebx,), %%al \n\t"
3478 "addb (%%esi,%%ebx,), %%al \n\t"
3479 "incl %%ebx \n\t"
3480//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3481 "cmpl %%ecx, %%ebx \n\t"
3482 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3483 "jb paeth_rlp \n\t"
3484 // get # of bytes to alignment
3485 "movl %%edi, _dif \n\t" // take start of row
3486 "addl %%ebx, _dif \n\t" // add bpp
3487 "xorl %%ecx, %%ecx \n\t"
3488 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3489 // boundary
3490 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3491 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3492 // at alignment
3493 "jz paeth_go \n\t"
3494 // fix alignment
3495
3496 "paeth_lp1: \n\t"
3497 "xorl %%eax, %%eax \n\t"
3498 // pav = p - a = (a + b - c) - a = b - c
3499 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3500 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3501 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3502 "movl %%eax, _patemp \n\t" // Save pav for later use
3503 "xorl %%eax, %%eax \n\t"
3504 // pbv = p - b = (a + b - c) - b = a - c
3505 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3506 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3507 "movl %%eax, %%ecx \n\t"
3508 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3509 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3510 // pc = abs(pcv)
3511 "testl $0x80000000, %%eax \n\t"
3512 "jz paeth_pca \n\t"
3513 "negl %%eax \n\t" // reverse sign of neg values
3514
3515 "paeth_pca: \n\t"
3516 "movl %%eax, _pctemp \n\t" // save pc for later use
3517 // pb = abs(pbv)
3518 "testl $0x80000000, %%ecx \n\t"
3519 "jz paeth_pba \n\t"
3520 "negl %%ecx \n\t" // reverse sign of neg values
3521
3522 "paeth_pba: \n\t"
3523 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3524 // pa = abs(pav)
3525 "movl _patemp, %%eax \n\t"
3526 "testl $0x80000000, %%eax \n\t"
3527 "jz paeth_paa \n\t"
3528 "negl %%eax \n\t" // reverse sign of neg values
3529
3530 "paeth_paa: \n\t"
3531 "movl %%eax, _patemp \n\t" // save pa for later use
3532 // test if pa <= pb
3533 "cmpl %%ecx, %%eax \n\t"
3534 "jna paeth_abb \n\t"
3535 // pa > pb; now test if pb <= pc
3536 "cmpl _pctemp, %%ecx \n\t"
3537 "jna paeth_bbc \n\t"
3538 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3539 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3540 "jmp paeth_paeth \n\t"
3541
3542 "paeth_bbc: \n\t"
3543 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3544 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3545 "jmp paeth_paeth \n\t"
3546
3547 "paeth_abb: \n\t"
3548 // pa <= pb; now test if pa <= pc
3549 "cmpl _pctemp, %%eax \n\t"
3550 "jna paeth_abc \n\t"
3551 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3552 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3553 "jmp paeth_paeth \n\t"
3554
3555 "paeth_abc: \n\t"
3556 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3557 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3558
3559 "paeth_paeth: \n\t"
3560 "incl %%ebx \n\t"
3561 "incl %%edx \n\t"
3562 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3563 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3564 "cmpl _dif, %%ebx \n\t"
3565 "jb paeth_lp1 \n\t"
3566
3567 "paeth_go: \n\t"
3568 "movl _FullLength, %%ecx \n\t"
3569 "movl %%ecx, %%eax \n\t"
3570 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3571 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3572 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3573 "movl %%ecx, _MMXLength \n\t"
3574#ifdef __PIC__
3575 "popl %%ebx \n\t" // restore index to Global Offset Table
3576#endif
3577
3578 : "=c" (dummy_value_c), // output regs (dummy)
3579 "=S" (dummy_value_S),
3580 "=D" (dummy_value_D)
3581
3582 : "0" (bpp), // ecx // input regs
3583 "1" (prev_row), // esi
3584 "2" (row) // edi
3585
3586 : "%eax", "%edx" // clobber list
3587#ifndef __PIC__
3588 , "%ebx"
3589#endif
3590 );
3591
3592 // now do the math for the rest of the row
3593 switch (bpp)
3594 {
3595 case 3:
3596 {
3597 _ActiveMask.use = 0x0000000000ffffffLL;
3598 _ActiveMaskEnd.use = 0xffff000000000000LL;
3599 _ShiftBpp.use = 24; // == bpp(3) * 8
3600 _ShiftRem.use = 40; // == 64 - 24
3601
3602 __asm__ __volatile__ (
3603 "movl _dif, %%ecx \n\t"
3604// preload "movl row, %%edi \n\t"
3605// preload "movl prev_row, %%esi \n\t"
3606 "pxor %%mm0, %%mm0 \n\t"
3607 // prime the pump: load the first Raw(x-bpp) data set
3608 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3609 "paeth_3lp: \n\t"
3610 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3611 // 3 bytes
3612 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3613 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3614 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3615 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3616 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3617 // 3 bytes
3618 // pav = p - a = (a + b - c) - a = b - c
3619 "movq %%mm2, %%mm4 \n\t"
3620 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3621 // pbv = p - b = (a + b - c) - b = a - c
3622 "movq %%mm1, %%mm5 \n\t"
3623 "psubw %%mm3, %%mm4 \n\t"
3624 "pxor %%mm7, %%mm7 \n\t"
3625 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3626 "movq %%mm4, %%mm6 \n\t"
3627 "psubw %%mm3, %%mm5 \n\t"
3628
3629 // pa = abs(p-a) = abs(pav)
3630 // pb = abs(p-b) = abs(pbv)
3631 // pc = abs(p-c) = abs(pcv)
3632 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3633 "paddw %%mm5, %%mm6 \n\t"
3634 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3635 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3636 "psubw %%mm0, %%mm4 \n\t"
3637 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3638 "psubw %%mm0, %%mm4 \n\t"
3639 "psubw %%mm7, %%mm5 \n\t"
3640 "pxor %%mm0, %%mm0 \n\t"
3641 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3642 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3643 "psubw %%mm7, %%mm5 \n\t"
3644 "psubw %%mm0, %%mm6 \n\t"
3645 // test pa <= pb
3646 "movq %%mm4, %%mm7 \n\t"
3647 "psubw %%mm0, %%mm6 \n\t"
3648 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3649 "movq %%mm7, %%mm0 \n\t"
3650 // use mm7 mask to merge pa & pb
3651 "pand %%mm7, %%mm5 \n\t"
3652 // use mm0 mask copy to merge a & b
3653 "pand %%mm0, %%mm2 \n\t"
3654 "pandn %%mm4, %%mm7 \n\t"
3655 "pandn %%mm1, %%mm0 \n\t"
3656 "paddw %%mm5, %%mm7 \n\t"
3657 "paddw %%mm2, %%mm0 \n\t"
3658 // test ((pa <= pb)? pa:pb) <= pc
3659 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3660 "pxor %%mm1, %%mm1 \n\t"
3661 "pand %%mm7, %%mm3 \n\t"
3662 "pandn %%mm0, %%mm7 \n\t"
3663 "paddw %%mm3, %%mm7 \n\t"
3664 "pxor %%mm0, %%mm0 \n\t"
3665 "packuswb %%mm1, %%mm7 \n\t"
3666 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3667 "pand _ActiveMask, %%mm7 \n\t"
3668 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3669 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3670 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3671 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3672 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3673 // Raw(x-bpp)
3674 // now do Paeth for 2nd set of bytes (3-5)
3675 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3676 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3677 "pxor %%mm7, %%mm7 \n\t"
3678 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3679 // pbv = p - b = (a + b - c) - b = a - c
3680 "movq %%mm1, %%mm5 \n\t"
3681 // pav = p - a = (a + b - c) - a = b - c
3682 "movq %%mm2, %%mm4 \n\t"
3683 "psubw %%mm3, %%mm5 \n\t"
3684 "psubw %%mm3, %%mm4 \n\t"
3685 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3686 // pav + pbv = pbv + pav
3687 "movq %%mm5, %%mm6 \n\t"
3688 "paddw %%mm4, %%mm6 \n\t"
3689
3690 // pa = abs(p-a) = abs(pav)
3691 // pb = abs(p-b) = abs(pbv)
3692 // pc = abs(p-c) = abs(pcv)
3693 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3694 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3695 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3696 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3697 "psubw %%mm0, %%mm5 \n\t"
3698 "psubw %%mm7, %%mm4 \n\t"
3699 "psubw %%mm0, %%mm5 \n\t"
3700 "psubw %%mm7, %%mm4 \n\t"
3701 "pxor %%mm0, %%mm0 \n\t"
3702 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3704 "psubw %%mm0, %%mm6 \n\t"
3705 // test pa <= pb
3706 "movq %%mm4, %%mm7 \n\t"
3707 "psubw %%mm0, %%mm6 \n\t"
3708 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3709 "movq %%mm7, %%mm0 \n\t"
3710 // use mm7 mask to merge pa & pb
3711 "pand %%mm7, %%mm5 \n\t"
3712 // use mm0 mask copy to merge a & b
3713 "pand %%mm0, %%mm2 \n\t"
3714 "pandn %%mm4, %%mm7 \n\t"
3715 "pandn %%mm1, %%mm0 \n\t"
3716 "paddw %%mm5, %%mm7 \n\t"
3717 "paddw %%mm2, %%mm0 \n\t"
3718 // test ((pa <= pb)? pa:pb) <= pc
3719 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3720 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3721 "pand %%mm7, %%mm3 \n\t"
3722 "pandn %%mm0, %%mm7 \n\t"
3723 "pxor %%mm1, %%mm1 \n\t"
3724 "paddw %%mm3, %%mm7 \n\t"
3725 "pxor %%mm0, %%mm0 \n\t"
3726 "packuswb %%mm1, %%mm7 \n\t"
3727 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3728 "pand _ActiveMask, %%mm7 \n\t"
3729 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3730 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3731 // 3 bytes
3732 // pav = p - a = (a + b - c) - a = b - c
3733 "movq %%mm2, %%mm4 \n\t"
3734 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3735 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3736 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3737 "movq %%mm7, %%mm1 \n\t"
3738 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3739 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3740 // now mm1 will be used as Raw(x-bpp)
3741 // now do Paeth for 3rd, and final, set of bytes (6-7)
3742 "pxor %%mm7, %%mm7 \n\t"
3743 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3744 "psubw %%mm3, %%mm4 \n\t"
3745 // pbv = p - b = (a + b - c) - b = a - c
3746 "movq %%mm1, %%mm5 \n\t"
3747 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3748 "movq %%mm4, %%mm6 \n\t"
3749 "psubw %%mm3, %%mm5 \n\t"
3750 "pxor %%mm0, %%mm0 \n\t"
3751 "paddw %%mm5, %%mm6 \n\t"
3752
3753 // pa = abs(p-a) = abs(pav)
3754 // pb = abs(p-b) = abs(pbv)
3755 // pc = abs(p-c) = abs(pcv)
3756 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3757 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3758 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3759 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3760 "psubw %%mm0, %%mm4 \n\t"
3761 "psubw %%mm7, %%mm5 \n\t"
3762 "psubw %%mm0, %%mm4 \n\t"
3763 "psubw %%mm7, %%mm5 \n\t"
3764 "pxor %%mm0, %%mm0 \n\t"
3765 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3766 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3767 "psubw %%mm0, %%mm6 \n\t"
3768 // test pa <= pb
3769 "movq %%mm4, %%mm7 \n\t"
3770 "psubw %%mm0, %%mm6 \n\t"
3771 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3772 "movq %%mm7, %%mm0 \n\t"
3773 // use mm0 mask copy to merge a & b
3774 "pand %%mm0, %%mm2 \n\t"
3775 // use mm7 mask to merge pa & pb
3776 "pand %%mm7, %%mm5 \n\t"
3777 "pandn %%mm1, %%mm0 \n\t"
3778 "pandn %%mm4, %%mm7 \n\t"
3779 "paddw %%mm2, %%mm0 \n\t"
3780 "paddw %%mm5, %%mm7 \n\t"
3781 // test ((pa <= pb)? pa:pb) <= pc
3782 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3783 "pand %%mm7, %%mm3 \n\t"
3784 "pandn %%mm0, %%mm7 \n\t"
3785 "paddw %%mm3, %%mm7 \n\t"
3786 "pxor %%mm1, %%mm1 \n\t"
3787 "packuswb %%mm7, %%mm1 \n\t"
3788 // step ecx to next set of 8 bytes and repeat loop til done
3789 "addl $8, %%ecx \n\t"
3790 "pand _ActiveMaskEnd, %%mm1 \n\t"
3791 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3792 // Raw(x)
3793
3794 "cmpl _MMXLength, %%ecx \n\t"
3795 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3796 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3797 // mm1 will be used as Raw(x-bpp) next loop
3798 // mm3 ready to be used as Prior(x-bpp) next loop
3799 "jb paeth_3lp \n\t"
3800
3801 : "=S" (dummy_value_S), // output regs (dummy)
3802 "=D" (dummy_value_D)
3803
3804 : "0" (prev_row), // esi // input regs
3805 "1" (row) // edi
3806
3807 : "%ecx" // clobber list
3808#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3809 , "%mm0", "%mm1", "%mm2", "%mm3"
3810 , "%mm4", "%mm5", "%mm6", "%mm7"
3811#endif
3812 );
3813 }
3814 break; // end 3 bpp
3815
3816 case 6:
3817 //case 7: // GRR BOGUS
3818 //case 5: // GRR BOGUS
3819 {
3820 _ActiveMask.use = 0x00000000ffffffffLL;
3821 _ActiveMask2.use = 0xffffffff00000000LL;
3822 _ShiftBpp.use = bpp << 3; // == bpp * 8
3823 _ShiftRem.use = 64 - _ShiftBpp.use;
3824
3825 __asm__ __volatile__ (
3826 "movl _dif, %%ecx \n\t"
3827// preload "movl row, %%edi \n\t"
3828// preload "movl prev_row, %%esi \n\t"
3829 // prime the pump: load the first Raw(x-bpp) data set
3830 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3831 "pxor %%mm0, %%mm0 \n\t"
3832
3833 "paeth_6lp: \n\t"
3834 // must shift to position Raw(x-bpp) data
3835 "psrlq _ShiftRem, %%mm1 \n\t"
3836 // do first set of 4 bytes
3837 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3838 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3839 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3840 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3841 // must shift to position Prior(x-bpp) data
3842 "psrlq _ShiftRem, %%mm3 \n\t"
3843 // pav = p - a = (a + b - c) - a = b - c
3844 "movq %%mm2, %%mm4 \n\t"
3845 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3846 // pbv = p - b = (a + b - c) - b = a - c
3847 "movq %%mm1, %%mm5 \n\t"
3848 "psubw %%mm3, %%mm4 \n\t"
3849 "pxor %%mm7, %%mm7 \n\t"
3850 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3851 "movq %%mm4, %%mm6 \n\t"
3852 "psubw %%mm3, %%mm5 \n\t"
3853 // pa = abs(p-a) = abs(pav)
3854 // pb = abs(p-b) = abs(pbv)
3855 // pc = abs(p-c) = abs(pcv)
3856 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3857 "paddw %%mm5, %%mm6 \n\t"
3858 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3859 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3860 "psubw %%mm0, %%mm4 \n\t"
3861 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3862 "psubw %%mm0, %%mm4 \n\t"
3863 "psubw %%mm7, %%mm5 \n\t"
3864 "pxor %%mm0, %%mm0 \n\t"
3865 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3866 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3867 "psubw %%mm7, %%mm5 \n\t"
3868 "psubw %%mm0, %%mm6 \n\t"
3869 // test pa <= pb
3870 "movq %%mm4, %%mm7 \n\t"
3871 "psubw %%mm0, %%mm6 \n\t"
3872 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3873 "movq %%mm7, %%mm0 \n\t"
3874 // use mm7 mask to merge pa & pb
3875 "pand %%mm7, %%mm5 \n\t"
3876 // use mm0 mask copy to merge a & b
3877 "pand %%mm0, %%mm2 \n\t"
3878 "pandn %%mm4, %%mm7 \n\t"
3879 "pandn %%mm1, %%mm0 \n\t"
3880 "paddw %%mm5, %%mm7 \n\t"
3881 "paddw %%mm2, %%mm0 \n\t"
3882 // test ((pa <= pb)? pa:pb) <= pc
3883 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3884 "pxor %%mm1, %%mm1 \n\t"
3885 "pand %%mm7, %%mm3 \n\t"
3886 "pandn %%mm0, %%mm7 \n\t"
3887 "paddw %%mm3, %%mm7 \n\t"
3888 "pxor %%mm0, %%mm0 \n\t"
3889 "packuswb %%mm1, %%mm7 \n\t"
3890 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3891 "pand _ActiveMask, %%mm7 \n\t"
3892 "psrlq _ShiftRem, %%mm3 \n\t"
3893 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3894 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3895 "movq %%mm2, %%mm6 \n\t"
3896 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3897 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3898 "psllq _ShiftBpp, %%mm6 \n\t"
3899 "movq %%mm7, %%mm5 \n\t"
3900 "psrlq _ShiftRem, %%mm1 \n\t"
3901 "por %%mm6, %%mm3 \n\t"
3902 "psllq _ShiftBpp, %%mm5 \n\t"
3903 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3904 "por %%mm5, %%mm1 \n\t"
3905 // do second set of 4 bytes
3906 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3907 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3908 // pav = p - a = (a + b - c) - a = b - c
3909 "movq %%mm2, %%mm4 \n\t"
3910 // pbv = p - b = (a + b - c) - b = a - c
3911 "movq %%mm1, %%mm5 \n\t"
3912 "psubw %%mm3, %%mm4 \n\t"
3913 "pxor %%mm7, %%mm7 \n\t"
3914 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3915 "movq %%mm4, %%mm6 \n\t"
3916 "psubw %%mm3, %%mm5 \n\t"
3917 // pa = abs(p-a) = abs(pav)
3918 // pb = abs(p-b) = abs(pbv)
3919 // pc = abs(p-c) = abs(pcv)
3920 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3921 "paddw %%mm5, %%mm6 \n\t"
3922 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3923 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3924 "psubw %%mm0, %%mm4 \n\t"
3925 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3926 "psubw %%mm0, %%mm4 \n\t"
3927 "psubw %%mm7, %%mm5 \n\t"
3928 "pxor %%mm0, %%mm0 \n\t"
3929 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3930 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3931 "psubw %%mm7, %%mm5 \n\t"
3932 "psubw %%mm0, %%mm6 \n\t"
3933 // test pa <= pb
3934 "movq %%mm4, %%mm7 \n\t"
3935 "psubw %%mm0, %%mm6 \n\t"
3936 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3937 "movq %%mm7, %%mm0 \n\t"
3938 // use mm7 mask to merge pa & pb
3939 "pand %%mm7, %%mm5 \n\t"
3940 // use mm0 mask copy to merge a & b
3941 "pand %%mm0, %%mm2 \n\t"
3942 "pandn %%mm4, %%mm7 \n\t"
3943 "pandn %%mm1, %%mm0 \n\t"
3944 "paddw %%mm5, %%mm7 \n\t"
3945 "paddw %%mm2, %%mm0 \n\t"
3946 // test ((pa <= pb)? pa:pb) <= pc
3947 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3948 "pxor %%mm1, %%mm1 \n\t"
3949 "pand %%mm7, %%mm3 \n\t"
3950 "pandn %%mm0, %%mm7 \n\t"
3951 "pxor %%mm1, %%mm1 \n\t"
3952 "paddw %%mm3, %%mm7 \n\t"
3953 "pxor %%mm0, %%mm0 \n\t"
3954 // step ecx to next set of 8 bytes and repeat loop til done
3955 "addl $8, %%ecx \n\t"
3956 "packuswb %%mm7, %%mm1 \n\t"
3957 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3958 "cmpl _MMXLength, %%ecx \n\t"
3959 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3960 // mm1 will be used as Raw(x-bpp) next loop
3961 "jb paeth_6lp \n\t"
3962
3963 : "=S" (dummy_value_S), // output regs (dummy)
3964 "=D" (dummy_value_D)
3965
3966 : "0" (prev_row), // esi // input regs
3967 "1" (row) // edi
3968
3969 : "%ecx" // clobber list
3970#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3971 , "%mm0", "%mm1", "%mm2", "%mm3"
3972 , "%mm4", "%mm5", "%mm6", "%mm7"
3973#endif
3974 );
3975 }
3976 break; // end 6 bpp
3977
3978 case 4:
3979 {
3980 _ActiveMask.use = 0x00000000ffffffffLL;
3981
3982 __asm__ __volatile__ (
3983 "movl _dif, %%ecx \n\t"
3984// preload "movl row, %%edi \n\t"
3985// preload "movl prev_row, %%esi \n\t"
3986 "pxor %%mm0, %%mm0 \n\t"
3987 // prime the pump: load the first Raw(x-bpp) data set
3988 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3989 // a=Raw(x-bpp) bytes
3990 "paeth_4lp: \n\t"
3991 // do first set of 4 bytes
3992 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3993 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3994 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3995 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3996 // pav = p - a = (a + b - c) - a = b - c
3997 "movq %%mm2, %%mm4 \n\t"
3998 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3999 // pbv = p - b = (a + b - c) - b = a - c
4000 "movq %%mm1, %%mm5 \n\t"
4001 "psubw %%mm3, %%mm4 \n\t"
4002 "pxor %%mm7, %%mm7 \n\t"
4003 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4004 "movq %%mm4, %%mm6 \n\t"
4005 "psubw %%mm3, %%mm5 \n\t"
4006 // pa = abs(p-a) = abs(pav)
4007 // pb = abs(p-b) = abs(pbv)
4008 // pc = abs(p-c) = abs(pcv)
4009 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4010 "paddw %%mm5, %%mm6 \n\t"
4011 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4012 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4013 "psubw %%mm0, %%mm4 \n\t"
4014 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4015 "psubw %%mm0, %%mm4 \n\t"
4016 "psubw %%mm7, %%mm5 \n\t"
4017 "pxor %%mm0, %%mm0 \n\t"
4018 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4019 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4020 "psubw %%mm7, %%mm5 \n\t"
4021 "psubw %%mm0, %%mm6 \n\t"
4022 // test pa <= pb
4023 "movq %%mm4, %%mm7 \n\t"
4024 "psubw %%mm0, %%mm6 \n\t"
4025 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4026 "movq %%mm7, %%mm0 \n\t"
4027 // use mm7 mask to merge pa & pb
4028 "pand %%mm7, %%mm5 \n\t"
4029 // use mm0 mask copy to merge a & b
4030 "pand %%mm0, %%mm2 \n\t"
4031 "pandn %%mm4, %%mm7 \n\t"
4032 "pandn %%mm1, %%mm0 \n\t"
4033 "paddw %%mm5, %%mm7 \n\t"
4034 "paddw %%mm2, %%mm0 \n\t"
4035 // test ((pa <= pb)? pa:pb) <= pc
4036 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4037 "pxor %%mm1, %%mm1 \n\t"
4038 "pand %%mm7, %%mm3 \n\t"
4039 "pandn %%mm0, %%mm7 \n\t"
4040 "paddw %%mm3, %%mm7 \n\t"
4041 "pxor %%mm0, %%mm0 \n\t"
4042 "packuswb %%mm1, %%mm7 \n\t"
4043 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4044 "pand _ActiveMask, %%mm7 \n\t"
4045 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4046 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4047 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4048 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4049 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4050 // do second set of 4 bytes
4051 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4052 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4053 // pav = p - a = (a + b - c) - a = b - c
4054 "movq %%mm2, %%mm4 \n\t"
4055 // pbv = p - b = (a + b - c) - b = a - c
4056 "movq %%mm1, %%mm5 \n\t"
4057 "psubw %%mm3, %%mm4 \n\t"
4058 "pxor %%mm7, %%mm7 \n\t"
4059 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4060 "movq %%mm4, %%mm6 \n\t"
4061 "psubw %%mm3, %%mm5 \n\t"
4062 // pa = abs(p-a) = abs(pav)
4063 // pb = abs(p-b) = abs(pbv)
4064 // pc = abs(p-c) = abs(pcv)
4065 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4066 "paddw %%mm5, %%mm6 \n\t"
4067 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4068 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4069 "psubw %%mm0, %%mm4 \n\t"
4070 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4071 "psubw %%mm0, %%mm4 \n\t"
4072 "psubw %%mm7, %%mm5 \n\t"
4073 "pxor %%mm0, %%mm0 \n\t"
4074 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4075 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4076 "psubw %%mm7, %%mm5 \n\t"
4077 "psubw %%mm0, %%mm6 \n\t"
4078 // test pa <= pb
4079 "movq %%mm4, %%mm7 \n\t"
4080 "psubw %%mm0, %%mm6 \n\t"
4081 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4082 "movq %%mm7, %%mm0 \n\t"
4083 // use mm7 mask to merge pa & pb
4084 "pand %%mm7, %%mm5 \n\t"
4085 // use mm0 mask copy to merge a & b
4086 "pand %%mm0, %%mm2 \n\t"
4087 "pandn %%mm4, %%mm7 \n\t"
4088 "pandn %%mm1, %%mm0 \n\t"
4089 "paddw %%mm5, %%mm7 \n\t"
4090 "paddw %%mm2, %%mm0 \n\t"
4091 // test ((pa <= pb)? pa:pb) <= pc
4092 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4093 "pxor %%mm1, %%mm1 \n\t"
4094 "pand %%mm7, %%mm3 \n\t"
4095 "pandn %%mm0, %%mm7 \n\t"
4096 "pxor %%mm1, %%mm1 \n\t"
4097 "paddw %%mm3, %%mm7 \n\t"
4098 "pxor %%mm0, %%mm0 \n\t"
4099 // step ecx to next set of 8 bytes and repeat loop til done
4100 "addl $8, %%ecx \n\t"
4101 "packuswb %%mm7, %%mm1 \n\t"
4102 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4103 "cmpl _MMXLength, %%ecx \n\t"
4104 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4105 // mm1 will be used as Raw(x-bpp) next loop
4106 "jb paeth_4lp \n\t"
4107
4108 : "=S" (dummy_value_S), // output regs (dummy)
4109 "=D" (dummy_value_D)
4110
4111 : "0" (prev_row), // esi // input regs
4112 "1" (row) // edi
4113
4114 : "%ecx" // clobber list
4115#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4116 , "%mm0", "%mm1", "%mm2", "%mm3"
4117 , "%mm4", "%mm5", "%mm6", "%mm7"
4118#endif
4119 );
4120 }
4121 break; // end 4 bpp
4122
4123 case 8: // bpp == 8
4124 {
4125 _ActiveMask.use = 0x00000000ffffffffLL;
4126
4127 __asm__ __volatile__ (
4128 "movl _dif, %%ecx \n\t"
4129// preload "movl row, %%edi \n\t"
4130// preload "movl prev_row, %%esi \n\t"
4131 "pxor %%mm0, %%mm0 \n\t"
4132 // prime the pump: load the first Raw(x-bpp) data set
4133 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4134 // a=Raw(x-bpp) bytes
4135 "paeth_8lp: \n\t"
4136 // do first set of 4 bytes
4137 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4138 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4139 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4140 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4141 // pav = p - a = (a + b - c) - a = b - c
4142 "movq %%mm2, %%mm4 \n\t"
4143 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4144 // pbv = p - b = (a + b - c) - b = a - c
4145 "movq %%mm1, %%mm5 \n\t"
4146 "psubw %%mm3, %%mm4 \n\t"
4147 "pxor %%mm7, %%mm7 \n\t"
4148 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4149 "movq %%mm4, %%mm6 \n\t"
4150 "psubw %%mm3, %%mm5 \n\t"
4151 // pa = abs(p-a) = abs(pav)
4152 // pb = abs(p-b) = abs(pbv)
4153 // pc = abs(p-c) = abs(pcv)
4154 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4155 "paddw %%mm5, %%mm6 \n\t"
4156 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4157 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4158 "psubw %%mm0, %%mm4 \n\t"
4159 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4160 "psubw %%mm0, %%mm4 \n\t"
4161 "psubw %%mm7, %%mm5 \n\t"
4162 "pxor %%mm0, %%mm0 \n\t"
4163 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4164 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4165 "psubw %%mm7, %%mm5 \n\t"
4166 "psubw %%mm0, %%mm6 \n\t"
4167 // test pa <= pb
4168 "movq %%mm4, %%mm7 \n\t"
4169 "psubw %%mm0, %%mm6 \n\t"
4170 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4171 "movq %%mm7, %%mm0 \n\t"
4172 // use mm7 mask to merge pa & pb
4173 "pand %%mm7, %%mm5 \n\t"
4174 // use mm0 mask copy to merge a & b
4175 "pand %%mm0, %%mm2 \n\t"
4176 "pandn %%mm4, %%mm7 \n\t"
4177 "pandn %%mm1, %%mm0 \n\t"
4178 "paddw %%mm5, %%mm7 \n\t"
4179 "paddw %%mm2, %%mm0 \n\t"
4180 // test ((pa <= pb)? pa:pb) <= pc
4181 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4182 "pxor %%mm1, %%mm1 \n\t"
4183 "pand %%mm7, %%mm3 \n\t"
4184 "pandn %%mm0, %%mm7 \n\t"
4185 "paddw %%mm3, %%mm7 \n\t"
4186 "pxor %%mm0, %%mm0 \n\t"
4187 "packuswb %%mm1, %%mm7 \n\t"
4188 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4189 "pand _ActiveMask, %%mm7 \n\t"
4190 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4191 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4192 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4193 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4194 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4195
4196 // do second set of 4 bytes
4197 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4198 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4199 // pav = p - a = (a + b - c) - a = b - c
4200 "movq %%mm2, %%mm4 \n\t"
4201 // pbv = p - b = (a + b - c) - b = a - c
4202 "movq %%mm1, %%mm5 \n\t"
4203 "psubw %%mm3, %%mm4 \n\t"
4204 "pxor %%mm7, %%mm7 \n\t"
4205 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4206 "movq %%mm4, %%mm6 \n\t"
4207 "psubw %%mm3, %%mm5 \n\t"
4208 // pa = abs(p-a) = abs(pav)
4209 // pb = abs(p-b) = abs(pbv)
4210 // pc = abs(p-c) = abs(pcv)
4211 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4212 "paddw %%mm5, %%mm6 \n\t"
4213 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4214 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4215 "psubw %%mm0, %%mm4 \n\t"
4216 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4217 "psubw %%mm0, %%mm4 \n\t"
4218 "psubw %%mm7, %%mm5 \n\t"
4219 "pxor %%mm0, %%mm0 \n\t"
4220 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4221 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4222 "psubw %%mm7, %%mm5 \n\t"
4223 "psubw %%mm0, %%mm6 \n\t"
4224 // test pa <= pb
4225 "movq %%mm4, %%mm7 \n\t"
4226 "psubw %%mm0, %%mm6 \n\t"
4227 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4228 "movq %%mm7, %%mm0 \n\t"
4229 // use mm7 mask to merge pa & pb
4230 "pand %%mm7, %%mm5 \n\t"
4231 // use mm0 mask copy to merge a & b
4232 "pand %%mm0, %%mm2 \n\t"
4233 "pandn %%mm4, %%mm7 \n\t"
4234 "pandn %%mm1, %%mm0 \n\t"
4235 "paddw %%mm5, %%mm7 \n\t"
4236 "paddw %%mm2, %%mm0 \n\t"
4237 // test ((pa <= pb)? pa:pb) <= pc
4238 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4239 "pxor %%mm1, %%mm1 \n\t"
4240 "pand %%mm7, %%mm3 \n\t"
4241 "pandn %%mm0, %%mm7 \n\t"
4242 "pxor %%mm1, %%mm1 \n\t"
4243 "paddw %%mm3, %%mm7 \n\t"
4244 "pxor %%mm0, %%mm0 \n\t"
4245 // step ecx to next set of 8 bytes and repeat loop til done
4246 "addl $8, %%ecx \n\t"
4247 "packuswb %%mm7, %%mm1 \n\t"
4248 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4249 "cmpl _MMXLength, %%ecx \n\t"
4250 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4251 // mm1 will be used as Raw(x-bpp) next loop
4252 "jb paeth_8lp \n\t"
4253
4254 : "=S" (dummy_value_S), // output regs (dummy)
4255 "=D" (dummy_value_D)
4256
4257 : "0" (prev_row), // esi // input regs
4258 "1" (row) // edi
4259
4260 : "%ecx" // clobber list
4261#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4262 , "%mm0", "%mm1", "%mm2", "%mm3"
4263 , "%mm4", "%mm5", "%mm6", "%mm7"
4264#endif
4265 );
4266 }
4267 break; // end 8 bpp
4268
4269 case 1: // bpp = 1
4270 case 2: // bpp = 2
4271 default: // bpp > 8
4272 {
4273 __asm__ __volatile__ (
4274#ifdef __PIC__
4275 "pushl %%ebx \n\t" // save Global Offset Table index
4276#endif
4277 "movl _dif, %%ebx \n\t"
4278 "cmpl _FullLength, %%ebx \n\t"
4279 "jnb paeth_dend \n\t"
4280
4281// preload "movl row, %%edi \n\t"
4282// preload "movl prev_row, %%esi \n\t"
4283 // do Paeth decode for remaining bytes
4284 "movl %%ebx, %%edx \n\t"
4285// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4286 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4287 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4288
4289 "paeth_dlp: \n\t"
4290 "xorl %%eax, %%eax \n\t"
4291 // pav = p - a = (a + b - c) - a = b - c
4292 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4293 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4294 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4295 "movl %%eax, _patemp \n\t" // Save pav for later use
4296 "xorl %%eax, %%eax \n\t"
4297 // pbv = p - b = (a + b - c) - b = a - c
4298 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4299 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4300 "movl %%eax, %%ecx \n\t"
4301 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4302 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4303 // pc = abs(pcv)
4304 "testl $0x80000000, %%eax \n\t"
4305 "jz paeth_dpca \n\t"
4306 "negl %%eax \n\t" // reverse sign of neg values
4307
4308 "paeth_dpca: \n\t"
4309 "movl %%eax, _pctemp \n\t" // save pc for later use
4310 // pb = abs(pbv)
4311 "testl $0x80000000, %%ecx \n\t"
4312 "jz paeth_dpba \n\t"
4313 "negl %%ecx \n\t" // reverse sign of neg values
4314
4315 "paeth_dpba: \n\t"
4316 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4317 // pa = abs(pav)
4318 "movl _patemp, %%eax \n\t"
4319 "testl $0x80000000, %%eax \n\t"
4320 "jz paeth_dpaa \n\t"
4321 "negl %%eax \n\t" // reverse sign of neg values
4322
4323 "paeth_dpaa: \n\t"
4324 "movl %%eax, _patemp \n\t" // save pa for later use
4325 // test if pa <= pb
4326 "cmpl %%ecx, %%eax \n\t"
4327 "jna paeth_dabb \n\t"
4328 // pa > pb; now test if pb <= pc
4329 "cmpl _pctemp, %%ecx \n\t"
4330 "jna paeth_dbbc \n\t"
4331 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4332 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4333 "jmp paeth_dpaeth \n\t"
4334
4335 "paeth_dbbc: \n\t"
4336 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4337 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4338 "jmp paeth_dpaeth \n\t"
4339
4340 "paeth_dabb: \n\t"
4341 // pa <= pb; now test if pa <= pc
4342 "cmpl _pctemp, %%eax \n\t"
4343 "jna paeth_dabc \n\t"
4344 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4345 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4346 "jmp paeth_dpaeth \n\t"
4347
4348 "paeth_dabc: \n\t"
4349 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4350 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4351
4352 "paeth_dpaeth: \n\t"
4353 "incl %%ebx \n\t"
4354 "incl %%edx \n\t"
4355 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4356 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4357 "cmpl _FullLength, %%ebx \n\t"
4358 "jb paeth_dlp \n\t"
4359
4360 "paeth_dend: \n\t"
4361#ifdef __PIC__
4362 "popl %%ebx \n\t" // index to Global Offset Table
4363#endif
4364
4365 : "=c" (dummy_value_c), // output regs (dummy)
4366 "=S" (dummy_value_S),
4367 "=D" (dummy_value_D)
4368
4369 : "0" (bpp), // ecx // input regs
4370 "1" (prev_row), // esi
4371 "2" (row) // edi
4372
4373 : "%eax", "%edx" // clobber list
4374#ifndef __PIC__
4375 , "%ebx"
4376#endif
4377 );
4378 }
4379 return; // No need to go further with this one
4380
4381 } // end switch (bpp)
4382
4383 __asm__ __volatile__ (
4384 // MMX acceleration complete; now do clean-up
4385 // check if any remaining bytes left to decode
4386#ifdef __PIC__
4387 "pushl %%ebx \n\t" // save index to Global Offset Table
4388#endif
4389 "movl _MMXLength, %%ebx \n\t"
4390 "cmpl _FullLength, %%ebx \n\t"
4391 "jnb paeth_end \n\t"
4392//pre "movl row, %%edi \n\t"
4393//pre "movl prev_row, %%esi \n\t"
4394 // do Paeth decode for remaining bytes
4395 "movl %%ebx, %%edx \n\t"
4396//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4397 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4398 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4399
4400 "paeth_lp2: \n\t"
4401 "xorl %%eax, %%eax \n\t"
4402 // pav = p - a = (a + b - c) - a = b - c
4403 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4404 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4405 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4406 "movl %%eax, _patemp \n\t" // Save pav for later use
4407 "xorl %%eax, %%eax \n\t"
4408 // pbv = p - b = (a + b - c) - b = a - c
4409 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4410 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4411 "movl %%eax, %%ecx \n\t"
4412 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4413 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4414 // pc = abs(pcv)
4415 "testl $0x80000000, %%eax \n\t"
4416 "jz paeth_pca2 \n\t"
4417 "negl %%eax \n\t" // reverse sign of neg values
4418
4419 "paeth_pca2: \n\t"
4420 "movl %%eax, _pctemp \n\t" // save pc for later use
4421 // pb = abs(pbv)
4422 "testl $0x80000000, %%ecx \n\t"
4423 "jz paeth_pba2 \n\t"
4424 "negl %%ecx \n\t" // reverse sign of neg values
4425
4426 "paeth_pba2: \n\t"
4427 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4428 // pa = abs(pav)
4429 "movl _patemp, %%eax \n\t"
4430 "testl $0x80000000, %%eax \n\t"
4431 "jz paeth_paa2 \n\t"
4432 "negl %%eax \n\t" // reverse sign of neg values
4433
4434 "paeth_paa2: \n\t"
4435 "movl %%eax, _patemp \n\t" // save pa for later use
4436 // test if pa <= pb
4437 "cmpl %%ecx, %%eax \n\t"
4438 "jna paeth_abb2 \n\t"
4439 // pa > pb; now test if pb <= pc
4440 "cmpl _pctemp, %%ecx \n\t"
4441 "jna paeth_bbc2 \n\t"
4442 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4443 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4444 "jmp paeth_paeth2 \n\t"
4445
4446 "paeth_bbc2: \n\t"
4447 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4448 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4449 "jmp paeth_paeth2 \n\t"
4450
4451 "paeth_abb2: \n\t"
4452 // pa <= pb; now test if pa <= pc
4453 "cmpl _pctemp, %%eax \n\t"
4454 "jna paeth_abc2 \n\t"
4455 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4456 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4457 "jmp paeth_paeth2 \n\t"
4458
4459 "paeth_abc2: \n\t"
4460 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4461 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4462
4463 "paeth_paeth2: \n\t"
4464 "incl %%ebx \n\t"
4465 "incl %%edx \n\t"
4466 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4467 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4468 "cmpl _FullLength, %%ebx \n\t"
4469 "jb paeth_lp2 \n\t"
4470
4471 "paeth_end: \n\t"
4472 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4473#ifdef __PIC__
4474 "popl %%ebx \n\t" // restore index to Global Offset Table
4475#endif
4476
4477 : "=c" (dummy_value_c), // output regs (dummy)
4478 "=S" (dummy_value_S),
4479 "=D" (dummy_value_D)
4480
4481 : "0" (bpp), // ecx // input regs
4482 "1" (prev_row), // esi
4483 "2" (row) // edi
4484
4485 : "%eax", "%edx" // clobber list (no input regs!)
4486#ifndef __PIC__
4487 , "%ebx"
4488#endif
4489 );
4490
4491} /* end png_read_filter_row_mmx_paeth() */
4492#endif
4493
4494
4495
4496
4497#ifdef PNG_THREAD_UNSAFE_OK
4498//===========================================================================//
4499// //
4500// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4501// //
4502//===========================================================================//
4503
4504// Optimized code for PNG Sub filter decoder
4505
4506static void /* PRIVATE */
4507png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4508{
4509 int bpp;
4510 int dummy_value_a;
4511 int dummy_value_D;
4512
4513 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4514 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4515
4516 __asm__ __volatile__ (
4517//pre "movl row, %%edi \n\t"
4518 "movl %%edi, %%esi \n\t" // lp = row
4519//pre "movl bpp, %%eax \n\t"
4520 "addl %%eax, %%edi \n\t" // rp = row + bpp
4521//irr "xorl %%eax, %%eax \n\t"
4522 // get # of bytes to alignment
4523 "movl %%edi, _dif \n\t" // take start of row
4524 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4525 // alignment boundary
4526 "xorl %%ecx, %%ecx \n\t"
4527 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4528 "subl %%edi, _dif \n\t" // subtract from start ==> value
4529 "jz sub_go \n\t" // ecx at alignment
4530
4531 "sub_lp1: \n\t" // fix alignment
4532 "movb (%%esi,%%ecx,), %%al \n\t"
4533 "addb %%al, (%%edi,%%ecx,) \n\t"
4534 "incl %%ecx \n\t"
4535 "cmpl _dif, %%ecx \n\t"
4536 "jb sub_lp1 \n\t"
4537
4538 "sub_go: \n\t"
4539 "movl _FullLength, %%eax \n\t"
4540 "movl %%eax, %%edx \n\t"
4541 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4542 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4543 "subl %%edx, %%eax \n\t" // drop over bytes from length
4544 "movl %%eax, _MMXLength \n\t"
4545
4546 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4547 "=D" (dummy_value_D) // 1
4548
4549 : "0" (bpp), // eax // input regs
4550 "1" (row) // edi
4551
4552 : "%esi", "%ecx", "%edx" // clobber list
4553
4554#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4555 , "%mm0", "%mm1", "%mm2", "%mm3"
4556 , "%mm4", "%mm5", "%mm6", "%mm7"
4557#endif
4558 );
4559
4560 // now do the math for the rest of the row
4561 switch (bpp)
4562 {
4563 case 3:
4564 {
4565 _ActiveMask.use = 0x0000ffffff000000LL;
4566 _ShiftBpp.use = 24; // == 3 * 8
4567 _ShiftRem.use = 40; // == 64 - 24
4568
4569 __asm__ __volatile__ (
4570// preload "movl row, %%edi \n\t"
4571 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4572 // active byte group
4573 "movl %%edi, %%esi \n\t" // lp = row
4574// preload "movl bpp, %%eax \n\t"
4575 "addl %%eax, %%edi \n\t" // rp = row + bpp
4576 "movq %%mm7, %%mm6 \n\t"
4577 "movl _dif, %%edx \n\t"
4578 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4579 // 3rd active byte group
4580 // prime the pump: load the first Raw(x-bpp) data set
4581 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4582
4583 "sub_3lp: \n\t" // shift data for adding first
4584 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4585 // shift clears inactive bytes)
4586 // add 1st active group
4587 "movq (%%edi,%%edx,), %%mm0 \n\t"
4588 "paddb %%mm1, %%mm0 \n\t"
4589
4590 // add 2nd active group
4591 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4592 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4593 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4594 "paddb %%mm1, %%mm0 \n\t"
4595
4596 // add 3rd active group
4597 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4598 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4599 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4600 "addl $8, %%edx \n\t"
4601 "paddb %%mm1, %%mm0 \n\t"
4602
4603 "cmpl _MMXLength, %%edx \n\t"
4604 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4605 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4606 "jb sub_3lp \n\t"
4607
4608 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4609 "=D" (dummy_value_D) // 1
4610
4611 : "0" (bpp), // eax // input regs
4612 "1" (row) // edi
4613
4614 : "%edx", "%esi" // clobber list
4615#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4616 , "%mm0", "%mm1", "%mm6", "%mm7"
4617#endif
4618 );
4619 }
4620 break;
4621
4622 case 1:
4623 {
4624 __asm__ __volatile__ (
4625 "movl _dif, %%edx \n\t"
4626// preload "movl row, %%edi \n\t"
4627 "cmpl _FullLength, %%edx \n\t"
4628 "jnb sub_1end \n\t"
4629 "movl %%edi, %%esi \n\t" // lp = row
4630 "xorl %%eax, %%eax \n\t"
4631// preload "movl bpp, %%eax \n\t"
4632 "addl %%eax, %%edi \n\t" // rp = row + bpp
4633
4634 "sub_1lp: \n\t"
4635 "movb (%%esi,%%edx,), %%al \n\t"
4636 "addb %%al, (%%edi,%%edx,) \n\t"
4637 "incl %%edx \n\t"
4638 "cmpl _FullLength, %%edx \n\t"
4639 "jb sub_1lp \n\t"
4640
4641 "sub_1end: \n\t"
4642
4643 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4644 "=D" (dummy_value_D) // 1
4645
4646 : "0" (bpp), // eax // input regs
4647 "1" (row) // edi
4648
4649 : "%edx", "%esi" // clobber list
4650 );
4651 }
4652 return;
4653
4654 case 6:
4655 case 4:
4656 //case 7: // GRR BOGUS
4657 //case 5: // GRR BOGUS
4658 {
4659 _ShiftBpp.use = bpp << 3;
4660 _ShiftRem.use = 64 - _ShiftBpp.use;
4661
4662 __asm__ __volatile__ (
4663// preload "movl row, %%edi \n\t"
4664 "movl _dif, %%edx \n\t"
4665 "movl %%edi, %%esi \n\t" // lp = row
4666// preload "movl bpp, %%eax \n\t"
4667 "addl %%eax, %%edi \n\t" // rp = row + bpp
4668
4669 // prime the pump: load the first Raw(x-bpp) data set
4670 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4671
4672 "sub_4lp: \n\t" // shift data for adding first
4673 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4674 // shift clears inactive bytes)
4675 "movq (%%edi,%%edx,), %%mm0 \n\t"
4676 "paddb %%mm1, %%mm0 \n\t"
4677
4678 // add 2nd active group
4679 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4680 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4681 "addl $8, %%edx \n\t"
4682 "paddb %%mm1, %%mm0 \n\t"
4683
4684 "cmpl _MMXLength, %%edx \n\t"
4685 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4686 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4687 "jb sub_4lp \n\t"
4688
4689 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4690 "=D" (dummy_value_D) // 1
4691
4692 : "0" (bpp), // eax // input regs
4693 "1" (row) // edi
4694
4695 : "%edx", "%esi" // clobber list
4696#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4697 , "%mm0", "%mm1"
4698#endif
4699 );
4700 }
4701 break;
4702
4703 case 2:
4704 {
4705 _ActiveMask.use = 0x00000000ffff0000LL;
4706 _ShiftBpp.use = 16; // == 2 * 8
4707 _ShiftRem.use = 48; // == 64 - 16
4708
4709 __asm__ __volatile__ (
4710 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4711 // active byte group
4712 "movl _dif, %%edx \n\t"
4713 "movq %%mm7, %%mm6 \n\t"
4714// preload "movl row, %%edi \n\t"
4715 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4716 // 3rd active byte group
4717 "movl %%edi, %%esi \n\t" // lp = row
4718 "movq %%mm6, %%mm5 \n\t"
4719// preload "movl bpp, %%eax \n\t"
4720 "addl %%eax, %%edi \n\t" // rp = row + bpp
4721 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4722 // 4th active byte group
4723 // prime the pump: load the first Raw(x-bpp) data set
4724 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4725
4726 "sub_2lp: \n\t" // shift data for adding first
4727 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4728 // shift clears inactive bytes)
4729 // add 1st active group
4730 "movq (%%edi,%%edx,), %%mm0 \n\t"
4731 "paddb %%mm1, %%mm0 \n\t"
4732
4733 // add 2nd active group
4734 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4735 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4736 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4737 "paddb %%mm1, %%mm0 \n\t"
4738
4739 // add 3rd active group
4740 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4741 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4742 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4743 "paddb %%mm1, %%mm0 \n\t"
4744
4745 // add 4th active group
4746 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4747 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4748 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4749 "addl $8, %%edx \n\t"
4750 "paddb %%mm1, %%mm0 \n\t"
4751 "cmpl _MMXLength, %%edx \n\t"
4752 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4753 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4754 "jb sub_2lp \n\t"
4755
4756 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4757 "=D" (dummy_value_D) // 1
4758
4759 : "0" (bpp), // eax // input regs
4760 "1" (row) // edi
4761
4762 : "%edx", "%esi" // clobber list
4763#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4764 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4765#endif
4766 );
4767 }
4768 break;
4769
4770 case 8:
4771 {
4772 __asm__ __volatile__ (
4773// preload "movl row, %%edi \n\t"
4774 "movl _dif, %%edx \n\t"
4775 "movl %%edi, %%esi \n\t" // lp = row
4776// preload "movl bpp, %%eax \n\t"
4777 "addl %%eax, %%edi \n\t" // rp = row + bpp
4778 "movl _MMXLength, %%ecx \n\t"
4779
4780 // prime the pump: load the first Raw(x-bpp) data set
4781 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4782 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4783
4784 "sub_8lp: \n\t"
4785 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4786 "paddb %%mm7, %%mm0 \n\t"
4787 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4788 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4789
4790 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4791 // This will be repeated for each group of 8 bytes with the 8th
4792 // group being used as the Raw(x-bpp) for the 1st group of the
4793 // next loop.
4794
4795 "paddb %%mm0, %%mm1 \n\t"
4796 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4797 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4798 "paddb %%mm1, %%mm2 \n\t"
4799 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4800 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4801 "paddb %%mm2, %%mm3 \n\t"
4802 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4803 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4804 "paddb %%mm3, %%mm4 \n\t"
4805 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4806 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4807 "paddb %%mm4, %%mm5 \n\t"
4808 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4809 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4810 "paddb %%mm5, %%mm6 \n\t"
4811 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4812 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4813 "addl $64, %%edx \n\t"
4814 "paddb %%mm6, %%mm7 \n\t"
4815 "cmpl %%ecx, %%edx \n\t"
4816 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4817 "jb sub_8lp \n\t"
4818
4819 "cmpl _MMXLength, %%edx \n\t"
4820 "jnb sub_8lt8 \n\t"
4821
4822 "sub_8lpA: \n\t"
4823 "movq (%%edi,%%edx,), %%mm0 \n\t"
4824 "addl $8, %%edx \n\t"
4825 "paddb %%mm7, %%mm0 \n\t"
4826 "cmpl _MMXLength, %%edx \n\t"
4827 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4828 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4829 // to mm1 to be new Raw(x-bpp)
4830 // for next loop
4831 "jb sub_8lpA \n\t"
4832
4833 "sub_8lt8: \n\t"
4834
4835 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4836 "=D" (dummy_value_D) // 1
4837
4838 : "0" (bpp), // eax // input regs
4839 "1" (row) // edi
4840
4841 : "%ecx", "%edx", "%esi" // clobber list
4842#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4843 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4844#endif
4845 );
4846 }
4847 break;
4848
4849 default: // bpp greater than 8 bytes GRR BOGUS
4850 {
4851 __asm__ __volatile__ (
4852 "movl _dif, %%edx \n\t"
4853// preload "movl row, %%edi \n\t"
4854 "movl %%edi, %%esi \n\t" // lp = row
4855// preload "movl bpp, %%eax \n\t"
4856 "addl %%eax, %%edi \n\t" // rp = row + bpp
4857
4858 "sub_Alp: \n\t"
4859 "movq (%%edi,%%edx,), %%mm0 \n\t"
4860 "movq (%%esi,%%edx,), %%mm1 \n\t"
4861 "addl $8, %%edx \n\t"
4862 "paddb %%mm1, %%mm0 \n\t"
4863 "cmpl _MMXLength, %%edx \n\t"
4864 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4865 // -8 to offset addl edx
4866 "jb sub_Alp \n\t"
4867
4868 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4869 "=D" (dummy_value_D) // 1
4870
4871 : "0" (bpp), // eax // input regs
4872 "1" (row) // edi
4873
4874 : "%edx", "%esi" // clobber list
4875#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4876 , "%mm0", "%mm1"
4877#endif
4878 );
4879 }
4880 break;
4881
4882 } // end switch (bpp)
4883
4884 __asm__ __volatile__ (
4885 "movl _MMXLength, %%edx \n\t"
4886//pre "movl row, %%edi \n\t"
4887 "cmpl _FullLength, %%edx \n\t"
4888 "jnb sub_end \n\t"
4889
4890 "movl %%edi, %%esi \n\t" // lp = row
4891//pre "movl bpp, %%eax \n\t"
4892 "addl %%eax, %%edi \n\t" // rp = row + bpp
4893 "xorl %%eax, %%eax \n\t"
4894
4895 "sub_lp2: \n\t"
4896 "movb (%%esi,%%edx,), %%al \n\t"
4897 "addb %%al, (%%edi,%%edx,) \n\t"
4898 "incl %%edx \n\t"
4899 "cmpl _FullLength, %%edx \n\t"
4900 "jb sub_lp2 \n\t"
4901
4902 "sub_end: \n\t"
4903 "EMMS \n\t" // end MMX instructions
4904
4905 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4906 "=D" (dummy_value_D) // 1
4907
4908 : "0" (bpp), // eax // input regs
4909 "1" (row) // edi
4910
4911 : "%edx", "%esi" // clobber list
4912 );
4913
4914} // end of png_read_filter_row_mmx_sub()
4915#endif
4916
4917
4918
4919
4920//===========================================================================//
4921// //
4922// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4923// //
4924//===========================================================================//
4925
4926// Optimized code for PNG Up filter decoder
4927
4928static void /* PRIVATE */
4929png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4930 png_bytep prev_row)
4931{
4932 png_uint_32 len;
4933 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4934 int dummy_value_S;
4935 int dummy_value_D;
4936
4937 len = row_info->rowbytes; // number of bytes to filter
4938
4939 __asm__ __volatile__ (
4940//pre "movl row, %%edi \n\t"
4941 // get # of bytes to alignment
4942#ifdef __PIC__
4943 "pushl %%ebx \n\t"
4944#endif
4945 "movl %%edi, %%ecx \n\t"
4946 "xorl %%ebx, %%ebx \n\t"
4947 "addl $0x7, %%ecx \n\t"
4948 "xorl %%eax, %%eax \n\t"
4949 "andl $0xfffffff8, %%ecx \n\t"
4950//pre "movl prev_row, %%esi \n\t"
4951 "subl %%edi, %%ecx \n\t"
4952 "jz up_go \n\t"
4953
4954 "up_lp1: \n\t" // fix alignment
4955 "movb (%%edi,%%ebx,), %%al \n\t"
4956 "addb (%%esi,%%ebx,), %%al \n\t"
4957 "incl %%ebx \n\t"
4958 "cmpl %%ecx, %%ebx \n\t"
4959 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4960 "jb up_lp1 \n\t" // offset incl ebx
4961
4962 "up_go: \n\t"
4963//pre "movl len, %%edx \n\t"
4964 "movl %%edx, %%ecx \n\t"
4965 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4966 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4967 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4968
4969 // unrolled loop - use all MMX registers and interleave to reduce
4970 // number of branch instructions (loops) and reduce partial stalls
4971 "up_loop: \n\t"
4972 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4973 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4974 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4975 "paddb %%mm1, %%mm0 \n\t"
4976 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4977 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4978 "paddb %%mm3, %%mm2 \n\t"
4979 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4980 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4981 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4982 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4983 "paddb %%mm5, %%mm4 \n\t"
4984 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4985 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4986 "paddb %%mm7, %%mm6 \n\t"
4987 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4988 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4989 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4990 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4991 "paddb %%mm1, %%mm0 \n\t"
4992 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4993 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4994 "paddb %%mm3, %%mm2 \n\t"
4995 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4996 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4997 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4998 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4999 "paddb %%mm5, %%mm4 \n\t"
5000 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
5001 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
5002 "addl $64, %%ebx \n\t"
5003 "paddb %%mm7, %%mm6 \n\t"
5004 "cmpl %%ecx, %%ebx \n\t"
5005 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
5006 "jb up_loop \n\t" // -8 to offset addl ebx
5007
5008 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
5009 "jz up_end \n\t"
5010
5011 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
5012 "jb up_lt8 \n\t" // [added by lcreeve at netins.net]
5013
5014 "addl %%edx, %%ecx \n\t"
5015 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
5016 "subl %%edx, %%ecx \n\t" // drop over bytes from length
5017 "jz up_lt8 \n\t"
5018
5019 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
5020 "movq (%%esi,%%ebx,), %%mm1 \n\t"
5021 "movq (%%edi,%%ebx,), %%mm0 \n\t"
5022 "addl $8, %%ebx \n\t"
5023 "paddb %%mm1, %%mm0 \n\t"
5024 "cmpl %%ecx, %%ebx \n\t"
5025 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5026 "jb up_lpA \n\t" // offset add ebx
5027 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
5028 "jz up_end \n\t"
5029
5030 "up_lt8: \n\t"
5031 "xorl %%eax, %%eax \n\t"
5032 "addl %%edx, %%ecx \n\t" // move over byte count into counter
5033
5034 "up_lp2: \n\t" // use x86 regs for remaining bytes
5035 "movb (%%edi,%%ebx,), %%al \n\t"
5036 "addb (%%esi,%%ebx,), %%al \n\t"
5037 "incl %%ebx \n\t"
5038 "cmpl %%ecx, %%ebx \n\t"
5039 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
5040 "jb up_lp2 \n\t" // offset inc ebx
5041
5042 "up_end: \n\t"
5043 "EMMS \n\t" // conversion of filtered row complete
5044#ifdef __PIC__
5045 "popl %%ebx \n\t"
5046#endif
5047
5048 : "=d" (dummy_value_d), // 0 // output regs (dummy)
5049 "=S" (dummy_value_S), // 1
5050 "=D" (dummy_value_D) // 2
5051
5052 : "0" (len), // edx // input regs
5053 "1" (prev_row), // esi
5054 "2" (row) // edi
5055
5056 : "%eax", "%ecx" // clobber list (no input regs!)
5057#ifndef __PIC__
5058 , "%ebx"
5059#endif
5060
5061#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5062 , "%mm0", "%mm1", "%mm2", "%mm3"
5063 , "%mm4", "%mm5", "%mm6", "%mm7"
5064#endif
5065 );
5066
5067} // end of png_read_filter_row_mmx_up()
5068
5069#endif /* PNG_MMX_CODE_SUPPORTED */
5070
5071
5072
5073
5074/*===========================================================================*/
5075/* */
5076/* P N G _ R E A D _ F I L T E R _ R O W */
5077/* */
5078/*===========================================================================*/
5079
5080
5081/* Optimized png_read_filter_row routines */
5082
5083void /* PRIVATE */
5084png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5085 row, png_bytep prev_row, int filter)
5086{
5087#ifdef PNG_DEBUG
5088 char filnm[10];
5089#endif
5090
5091#if defined(PNG_MMX_CODE_SUPPORTED)
5092/* GRR: these are superseded by png_ptr->asm_flags: */
5093#define UseMMX_sub 1 // GRR: converted 20000730
5094#define UseMMX_up 1 // GRR: converted 20000729
5095#define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5096#define UseMMX_paeth 1 // GRR: converted 20000828
5097
5098 if (_mmx_supported == 2) {
5099 /* this should have happened in png_init_mmx_flags() already */
5100#if !defined(PNG_1_0_X)
5101 png_warning(png_ptr, "asm_flags may not have been initialized");
5102#endif
5103 png_mmx_support();
5104 }
5105#endif /* PNG_MMX_CODE_SUPPORTED */
5106
5107#ifdef PNG_DEBUG
5108 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5109 switch (filter)
5110 {
5111 case 0: sprintf(filnm, "none");
5112 break;
5113 case 1: sprintf(filnm, "sub-%s",
5114#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5115#if !defined(PNG_1_0_X)
5116 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5117#endif
5118#endif
5119"x86");
5120 break;
5121 case 2: sprintf(filnm, "up-%s",
5122#ifdef PNG_MMX_CODE_SUPPORTED
5123#if !defined(PNG_1_0_X)
5124 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5125#endif
5126#endif
5127 "x86");
5128 break;
5129 case 3: sprintf(filnm, "avg-%s",
5130#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5131#if !defined(PNG_1_0_X)
5132 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5133#endif
5134#endif
5135 "x86");
5136 break;
5137 case 4: sprintf(filnm, "Paeth-%s",
5138#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5139#if !defined(PNG_1_0_X)
5140 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5141#endif
5142#endif
5143"x86");
5144 break;
5145 default: sprintf(filnm, "unknw");
5146 break;
5147 }
5148 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5149 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5150 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5151 (int)((row_info->pixel_depth + 7) >> 3));
5152 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5153#endif /* PNG_DEBUG */
5154
5155 switch (filter)
5156 {
5157 case PNG_FILTER_VALUE_NONE:
5158 break;
5159
5160 case PNG_FILTER_VALUE_SUB:
5161#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5162#if !defined(PNG_1_0_X)
5163 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5164 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5165 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5166#else
5167 if (_mmx_supported)
5168#endif
5169 {
5170 png_read_filter_row_mmx_sub(row_info, row);
5171 }
5172 else
5173#endif /* PNG_MMX_CODE_SUPPORTED */
5174 {
5175 png_uint_32 i;
5176 png_uint_32 istop = row_info->rowbytes;
5177 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5178 png_bytep rp = row + bpp;
5179 png_bytep lp = row;
5180
5181 for (i = bpp; i < istop; i++)
5182 {
5183 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5184 rp++;
5185 }
5186 } /* end !UseMMX_sub */
5187 break;
5188
5189 case PNG_FILTER_VALUE_UP:
5190#if defined(PNG_MMX_CODE_SUPPORTED)
5191#if !defined(PNG_1_0_X)
5192 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5193 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5194 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5195#else
5196 if (_mmx_supported)
5197#endif
5198 {
5199 png_read_filter_row_mmx_up(row_info, row, prev_row);
5200 }
5201 else
5202#endif /* PNG_MMX_CODE_SUPPORTED */
5203 {
5204 png_uint_32 i;
5205 png_uint_32 istop = row_info->rowbytes;
5206 png_bytep rp = row;
5207 png_bytep pp = prev_row;
5208
5209 for (i = 0; i < istop; ++i)
5210 {
5211 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5212 rp++;
5213 }
5214 } /* end !UseMMX_up */
5215 break;
5216
5217 case PNG_FILTER_VALUE_AVG:
5218#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5219#if !defined(PNG_1_0_X)
5220 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5221 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5222 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5223#else
5224 if (_mmx_supported)
5225#endif
5226 {
5227 png_read_filter_row_mmx_avg(row_info, row, prev_row);
5228 }
5229 else
5230#endif /* PNG_MMX_CODE_SUPPORTED */
5231 {
5232 png_uint_32 i;
5233 png_bytep rp = row;
5234 png_bytep pp = prev_row;
5235 png_bytep lp = row;
5236 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5237 png_uint_32 istop = row_info->rowbytes - bpp;
5238
5239 for (i = 0; i < bpp; i++)
5240 {
5241 *rp = (png_byte)(((int)(*rp) +
5242 ((int)(*pp++) >> 1)) & 0xff);
5243 rp++;
5244 }
5245
5246 for (i = 0; i < istop; i++)
5247 {
5248 *rp = (png_byte)(((int)(*rp) +
5249 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5250 rp++;
5251 }
5252 } /* end !UseMMX_avg */
5253 break;
5254
5255 case PNG_FILTER_VALUE_PAETH:
5256#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5257#if !defined(PNG_1_0_X)
5258 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5259 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5260 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5261#else
5262 if (_mmx_supported)
5263#endif
5264 {
5265 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5266 }
5267 else
5268#endif /* PNG_MMX_CODE_SUPPORTED */
5269 {
5270 png_uint_32 i;
5271 png_bytep rp = row;
5272 png_bytep pp = prev_row;
5273 png_bytep lp = row;
5274 png_bytep cp = prev_row;
5275 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5276 png_uint_32 istop = row_info->rowbytes - bpp;
5277
5278 for (i = 0; i < bpp; i++)
5279 {
5280 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5281 rp++;
5282 }
5283
5284 for (i = 0; i < istop; i++) /* use leftover rp,pp */
5285 {
5286 int a, b, c, pa, pb, pc, p;
5287
5288 a = *lp++;
5289 b = *pp++;
5290 c = *cp++;
5291
5292 p = b - c;
5293 pc = a - c;
5294
5295#ifdef PNG_USE_ABS
5296 pa = abs(p);
5297 pb = abs(pc);
5298 pc = abs(p + pc);
5299#else
5300 pa = p < 0 ? -p : p;
5301 pb = pc < 0 ? -pc : pc;
5302 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5303#endif
5304
5305 /*
5306 if (pa <= pb && pa <= pc)
5307 p = a;
5308 else if (pb <= pc)
5309 p = b;
5310 else
5311 p = c;
5312 */
5313
5314 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5315
5316 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5317 rp++;
5318 }
5319 } /* end !UseMMX_paeth */
5320 break;
5321
5322 default:
5323 png_warning(png_ptr, "Ignoring bad row-filter type");
5324 *row=0;
5325 break;
5326 }
5327}
5328
5329#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5330
5331
5332/*===========================================================================*/
5333/* */
5334/* P N G _ M M X _ S U P P O R T */
5335/* */
5336/*===========================================================================*/
5337
5338/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5339 * (2) all instructions compile with gcc 2.7.2.3 and later
5340 * (3) the function is moved down here to prevent gcc from
5341 * inlining it in multiple places and then barfing be-
5342 * cause the ".NOT_SUPPORTED" label is multiply defined
5343 * [is there a way to signal that a *single* function should
5344 * not be inlined? is there a way to modify the label for
5345 * each inlined instance, e.g., by appending _1, _2, etc.?
5346 * maybe if don't use leading "." in label name? (nope...sigh)]
5347 */
5348
5349int PNGAPI
5350png_mmx_support(void)
5351{
5352#if defined(PNG_MMX_CODE_SUPPORTED)
5353 int result;
5354 __asm__ __volatile__ (
5355 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5356 "pushl %%ecx \n\t" // so does ecx...
5357 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5358// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5359// "pushf \n\t" // 16-bit pushf
5360 "pushfl \n\t" // save Eflag to stack
5361 "popl %%eax \n\t" // get Eflag from stack into eax
5362 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5363 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5364 "pushl %%eax \n\t" // save modified Eflag back to stack
5365// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5366// "popf \n\t" // 16-bit popf
5367 "popfl \n\t" // restore modified value to Eflag reg
5368 "pushfl \n\t" // save Eflag to stack
5369 "popl %%eax \n\t" // get Eflag from stack
5370 "pushl %%ecx \n\t" // save original Eflag to stack
5371 "popfl \n\t" // restore original Eflag
5372 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5373 "jz 0f \n\t" // if same, CPUID instr. is not supported
5374
5375 "xorl %%eax, %%eax \n\t" // set eax to zero
5376// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5377 "cpuid \n\t" // get the CPU identification info
5378 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
5379 "jl 0f \n\t" // if eax is zero, MMX is not supported
5380
5381 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5382 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5383 // faster than the instruction "mov eax, 1"
5384 "cpuid \n\t" // get the CPU identification info again
5385 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5386 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5387 "jz 0f \n\t" // non-zero = yes, MMX IS supported
5388
5389 "movl $1, %%eax \n\t" // set return value to 1
5390 "jmp 1f \n\t" // DONE: have MMX support
5391
5392 "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
5393 "movl $0, %%eax \n\t" // set return value to 0
5394 "1: \n\t" // .RETURN: target label for jump instructions
5395 "popl %%edx \n\t" // restore edx
5396 "popl %%ecx \n\t" // restore ecx
5397 "popl %%ebx \n\t" // restore ebx
5398
5399// "ret \n\t" // DONE: no MMX support
5400 // (fall through to standard C "ret")
5401
5402 : "=a" (result) // output list
5403
5404 : // any variables used on input (none)
5405
5406 // no clobber list
5407// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5408// , "memory" // if write to a variable gcc thought was in a reg
5409// , "cc" // "condition codes" (flag bits)
5410 );
5411 _mmx_supported = result;
5412#else
5413 _mmx_supported = 0;
5414#endif /* PNG_MMX_CODE_SUPPORTED */
5415
5416 return _mmx_supported;
5417}
5418
5419
5420#endif /* PNG_USE_PNGGCCRD */
Note: See TracBrowser for help on using the repository browser.