source: py-scraping/ClientForm.py@ 165

Last change on this file since 165 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 123.2 KB
Line 
1"""HTML form handling for web clients.
2
3ClientForm is a Python module for handling HTML forms on the client
4side, useful for parsing HTML forms, filling them in and returning the
5completed forms to the server. It has developed from a port of Gisle
6Aas' Perl module HTML::Form, from the libwww-perl library, but the
7interface is not the same.
8
9The most useful docstring is the one for HTMLForm.
10
11RFC 1866: HTML 2.0
12RFC 1867: Form-based File Upload in HTML
13RFC 2388: Returning Values from Forms: multipart/form-data
14HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
15HTML 4.01 Specification, W3C Recommendation 24 December 1999
16
17
18Copyright 2002-2007 John J. Lee <jjl@pobox.com>
19Copyright 2005 Gary Poster
20Copyright 2005 Zope Corporation
21Copyright 1998-2000 Gisle Aas.
22
23This code is free software; you can redistribute it and/or modify it
24under the terms of the BSD or ZPL 2.1 licenses (see the file
25COPYING.txt included with the distribution).
26
27"""
28
29# XXX
30# Remove parser testing hack
31# safeUrl()-ize action
32# Switch to unicode throughout (would be 0.3.x)
33# See Wichert Akkerman's 2004-01-22 message to c.l.py.
34# Add charset parameter to Content-type headers? How to find value??
35# Add some more functional tests
36# Especially single and multiple file upload on the internet.
37# Does file upload work when name is missing? Sourceforge tracker form
38# doesn't like it. Check standards, and test with Apache. Test
39# binary upload with Apache.
40# mailto submission & enctype text/plain
41# I'm not going to fix this unless somebody tells me what real servers
42# that want this encoding actually expect: If enctype is
43# application/x-www-form-urlencoded and there's a FILE control present.
44# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
45# 17.13.2), but I send "name=" ATM. What about multiple file upload??
46
47# Would be nice, but I'm not going to do it myself:
48# -------------------------------------------------
49# Maybe a 0.4.x?
50# Replace by_label etc. with moniker / selector concept. Allows, eg.,
51# a choice between selection by value / id / label / element
52# contents. Or choice between matching labels exactly or by
53# substring. Etc.
54# Remove deprecated methods.
55# ...what else?
56# Work on DOMForm.
57# XForms? Don't know if there's a need here.
58
# Names exported by ``from ClientForm import *`` (one per line, same order
# as before so the list value is unchanged).
__all__ = [
    'AmbiguityError',
    'CheckboxControl',
    'Control',
    'ControlNotFoundError',
    'FileControl',
    'FormParser',
    'HTMLForm',
    'HiddenControl',
    'IgnoreControl',
    'ImageControl',
    'IsindexControl',
    'Item',
    'ItemCountError',
    'ItemNotFoundError',
    'Label',
    'ListControl',
    'LocateError',
    'Missing',
    'ParseError',
    'ParseFile',
    'ParseFileEx',
    'ParseResponse',
    'ParseResponseEx',
    'PasswordControl',
    'RadioControl',
    'ScalarControl',
    'SelectControl',
    'SubmitButtonControl',
    'SubmitControl',
    'TextControl',
    'TextareaControl',
    'XHTMLCompatibleFormParser',
]
68
69try: True
70except NameError:
71 True = 1
72 False = 0
73
74try: bool
75except NameError:
76 def bool(expr):
77 if expr: return True
78 else: return False
79
try:
    import logging
    import inspect
except ImportError:
    def debug(msg, *args, **kwds):
        """No-op stand-in used when logging/inspect cannot be imported."""
        pass
else:
    _logger = logging.getLogger("ClientForm")
    # While true, debug() returns immediately, so the (costly) stack
    # inspection below is skipped.  _show_debug_messages() switches it off.
    OPTIMIZATION_HACK = True

    def debug(msg, *args, **kwds):
        """Log msg % args at DEBUG level, prefixed with the caller's name.

        msg: %-style format string.
        args, kwds: passed through to the "ClientForm" logger's debug().

        Does nothing while OPTIMIZATION_HACK is true.
        """
        if OPTIMIZATION_HACK:
            return

        # frame 1 of the stack is whoever called debug(); item 3 is its
        # function name
        caller_name = inspect.stack()[1][3]
        # prepend a literal "%s" placeholder for the caller name
        extended_msg = '%%s %s' % msg
        extended_args = (caller_name,) + args
        # The logger call returns nothing useful, so its result is no longer
        # bound to a local (the old code stored it in a local named "debug").
        _logger.debug(extended_msg, *extended_args, **kwds)

    def _show_debug_messages():
        """Enable debug() output and send it to sys.stdout."""
        global OPTIMIZATION_HACK
        OPTIMIZATION_HACK = False
        _logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        _logger.addHandler(handler)
106
107import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
108 htmlentitydefs, re, random
109from cStringIO import StringIO
110
111import sgmllib
112# monkeypatch to fix http://www.python.org/sf/803422 :-(
113sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
114
115# HTMLParser.HTMLParser is recent, so live without it if it's not available
116# (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
117try:
118 import HTMLParser
119except ImportError:
120 HAVE_MODULE_HTMLPARSER = False
121else:
122 HAVE_MODULE_HTMLPARSER = True
123
def deprecation(message, stack_offset=0):
    """Fallback used when the warnings module is unavailable: do nothing."""
    pass

try:
    import warnings
except ImportError:
    pass
else:
    def deprecation(message, stack_offset=0):
        """Issue a DeprecationWarning carrying message.

        stack_offset is added to the base stacklevel of 3 handed to
        warnings.warn, moving the reported location further up the call
        stack.
        """
        level = 3 + stack_offset
        warnings.warn(message, DeprecationWarning, stacklevel=level)
132
# Version string of this module.
VERSION = "0.2.10"

CHUNK = 1024  # size of chunks fed to parser, in bytes

# Default value of the ``encoding`` parameters used throughout this module.
DEFAULT_ENCODING = "latin-1"

# Empty marker class (listed in __all__).
# NOTE(review): presumably used as a "no value supplied" sentinel -- confirm
# against its users, which are outside this part of the file.
class Missing: pass
140
_compress_re = re.compile(r"\s+")
def compress_text(text):
    """Strip text, then collapse every whitespace run to a single space."""
    stripped = text.strip()
    return _compress_re.sub(" ", stripped)
143
def normalize_line_endings(text):
    """Return text with every lone LF or lone CR replaced by CRLF.

    Existing CRLF pairs are left as they are.
    """
    lone_lf = r"(?:(?<!\r)\n)"   # "\n" not preceded by "\r"
    lone_cr = r"(?:\r(?!\n))"    # "\r" not followed by "\n"
    return re.sub(lone_lf + "|" + lone_cr, "\r\n", text)
146
147
148# This version of urlencode is from my Python 1.5.2 back-port of the
149# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
150# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query, doseq=False,):
    """Encode a mapping or a sequence of two-element tuples as a URL query
    string.

    query: object with an items() method, or a sequence of (key, value)
        tuples.  For a sequence, the order of the parameters in the output
        matches the order in the input.
    doseq: if true, a value that is itself a (non-string) sequence is
        expanded into one "key=element" parameter per element.

    Returns the "k=v" pairs joined with "&"; keys and values are quoted with
    urllib.quote_plus.

    Raises TypeError if query is neither a mapping nor a sequence of tuples
    (for example a non-empty string).
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # len() raises TypeError for non-sequences; non-empty strings
            # fail the tuple check.  Zero-length sequences of any type are
            # accepted, consistent with empty mappings being accepted.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError()
        except TypeError:
            # Raise with the message as the only argument.  The previous
            # code passed the traceback object as a second exception
            # argument, which garbled str(exc) and kept the frames alive.
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    pairs = []
    if not doseq:
        # preserve old behavior: every value is stringified as a whole
        for k, v in query:
            k = urllib.quote_plus(str(k))
            v = urllib.quote_plus(str(v))
            pairs.append(k + '=' + v)
    else:
        for k, v in query:
            k = urllib.quote_plus(str(k))
            if isinstance(v, types.StringType):
                # byte strings (and subclasses) are single values, not
                # sequences of characters
                pairs.append(k + '=' + urllib.quote_plus(v))
            elif isinstance(v, types.UnicodeType):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = urllib.quote_plus(v.encode("ASCII", "replace"))
                pairs.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    pairs.append(k + '=' + urllib.quote_plus(str(v)))
                else:
                    # one parameter per element of the sequence
                    for elt in v:
                        pairs.append(k + '=' + urllib.quote_plus(str(elt)))
    return '&'.join(pairs)
216
def unescape(data, entities, encoding=DEFAULT_ENCODING):
    """Replace entity and character references in data.

    data: string to process; returned unchanged when it is None or holds
        no "&".
    entities: mapping from full reference text (e.g. "&amp;") to its
        replacement.
    encoding: encoding applied to non-str replacements and passed on to
        unescape_charref for numeric references.

    References that are unknown, or whose replacement cannot be encoded,
    are left in the text as they were.
    """
    if data is None or "&" not in data:
        return data

    # entities/encoding are bound as default arguments so the helper does
    # not depend on nested scopes
    def replace_entities(match, entities=entities, encoding=encoding):
        reference = match.group()
        if reference[1] == "#":
            # numeric reference: hand over the part between "&#" and ";"
            return unescape_charref(reference[2:-1], encoding)

        replacement = entities.get(reference)
        if replacement is None:
            return reference
        if type(replacement) == type(""):
            return replacement
        try:
            return replacement.encode(encoding)
        except UnicodeError:
            return reference

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
239
def unescape_charref(data, encoding):
    """Decode the body of a numeric character reference.

    data: the text between "&#" and ";", e.g. "65", "x41" or "X41".
    encoding: target encoding, or None to return the unicode character.

    Returns the decoded character (encoded to encoding unless that is
    None).  The original reference text "&#<data>;" is returned instead
    when data is not a valid number, names a code point the interpreter
    cannot represent, or cannot be encoded in encoding.  Malformed markup
    is therefore passed through rather than raising out of the parser.
    """
    name, base = data, 10
    # HTML allows both "&#x..." and "&#X..." for hexadecimal references
    if name[:1] in ("x", "X"):
        name, base = name[1:], 16
    try:
        codepoint = int(name, base)
    except ValueError:
        # e.g. "&#foo;" -- not a number at all
        return "&#%s;" % data
    try:
        uc = unichr(codepoint)
    except (ValueError, OverflowError):
        # negative or out-of-range code point
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
253
254def get_entitydefs():
255 import htmlentitydefs
256 from codecs import latin_1_decode
257 entitydefs = {}
258 try:
259 htmlentitydefs.name2codepoint
260 except AttributeError:
261 entitydefs = {}
262 for name, char in htmlentitydefs.entitydefs.items():
263 uc = latin_1_decode(char)[0]
264 if uc.startswith("&#") and uc.endswith(";"):
265 uc = unescape_charref(uc[2:-1], None)
266 entitydefs["&%s;" % name] = uc
267 else:
268 for name, codepoint in htmlentitydefs.name2codepoint.items():
269 entitydefs["&%s;" % name] = unichr(codepoint)
270 return entitydefs
271
272
def issequence(x):
    """Return True if x can be indexed with 0, or is an empty sequence.

    An IndexError from x[0] counts as "sequence, but empty"; TypeError or
    KeyError means x is not treated as a sequence.
    """
    try:
        x[0]
    except IndexError:
        return True
    except (TypeError, KeyError):
        return False
    return True
281
def isstringlike(x):
    """Return True if x can be concatenated with the empty string."""
    try:
        x + ""
    except Exception:
        # Any ordinary failure means "not string-like".  Unlike the former
        # bare except, KeyboardInterrupt and SystemExit are no longer
        # swallowed here.
        return False
    return True
286
287
288def choose_boundary():
289 """Return a string usable as a multipart boundary."""
290 # follow IE and firefox
291 nonce = "".join([str(random.randint(0, sys.maxint - 1)) for i in 0, 1, 2])
292 return "-" * 27 + nonce
293
294# This cut-n-pasted MimeWriter from standard library is here so can add
295# to HTTP headers rather than message body when appropriate. It also uses
296# \r\n in place of \n. This is a bit nasty.
class MimeWriter:

    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    """

    def __init__(self, fp, http_hdrs=None):
        """fp: output file object the message is written to.

        http_hdrs: list that receives (name, value) pairs for headers
            added with add_to_http_hdrs true; it must be supplied if that
            flag is ever used.
        """
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []        # buffered header lines, not yet written
        self._boundary = []       # stack of open multipart boundaries
        self._first_part = True   # no part of the current body started yet

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """Buffer a header, or record it in the HTTP header list.

        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = value.split("\r\n")
        # drop empty lines at either end of the value
        while lines and not lines[-1]: del lines[-1]
        while lines and not lines[0]: del lines[0]
        if add_to_http_hdrs:
            # HTTP headers are stored unfolded, on a single line
            value = "".join(lines)
            # 2.2 urllib2 doesn't normalize header case
            self._http_hdrs.append((key.capitalize(), value))
        else:
            # fold continuation lines: each starts with four spaces
            for i in range(1, len(lines)):
                lines[i] = "    " + lines[i].strip()
            value = "\r\n".join(lines) + "\r\n"
            line = key.title() + ": " + value
            if prefix:
                self._headers.insert(0, line)
            else:
                self._headers.append(line)

    def flushheaders(self):
        """Write the buffered headers to the output file and forget them."""
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=None, prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """Emit the headers and return the file to write the body to.

        ctype: content type; with content_type true and ctype non-empty a
            Content-Type header is added first.
        plist: iterable of (name, value) content-type parameters; None
            (the default) means no parameters.
        prefix is ignored if add_to_http_hdrs is true.
        """
        if content_type and ctype:
            # "plist or ()" replaces the former shared mutable default
            for name, value in plist or ():
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-Type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        # the blank line ending the header section is only written when the
        # Content-Type was not diverted to the HTTP headers
        if not add_to_http_hdrs: self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a "multipart/<subtype>" body and return the output file.

        boundary: boundary string; choose_boundary() supplies one when it
            is empty or None.
        plist: extra (name, value) content-type parameters, placed after
            the boundary parameter; None means none.
        The remaining arguments are passed to startbody().
        """
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        return self.startbody("multipart/" + subtype,
                              [("boundary", boundary)] + list(plist or ()),
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the delimiter for the next part and return its writer."""
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            # a CRLF terminates the previous part before the delimiter
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing delimiter of the innermost multipart body."""
        if self._first_part:
            # no part was written: emit one (empty) part first
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
439
440
class LocateError(ValueError):
    """Common base of AmbiguityError, ControlNotFoundError and
    ItemNotFoundError."""

class AmbiguityError(LocateError):
    """LocateError subclass."""

class ControlNotFoundError(LocateError):
    """LocateError subclass."""

class ItemNotFoundError(LocateError):
    """LocateError subclass."""


class ItemCountError(ValueError):
    """ValueError subclass; not part of the LocateError family."""
447
448# for backwards compatibility, ParseError derives from exceptions that were
449# raised by versions of ClientForm <= 0.2.5
# SGMLLIB_PARSEERROR is the exception type caught around sgmllib feeding;
# RuntimeError stands in when sgmllib has no SGMLParseError of its own.
if HAVE_MODULE_HTMLPARSER or hasattr(sgmllib, "SGMLParseError"):
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
else:
    SGMLLIB_PARSEERROR = RuntimeError

if HAVE_MODULE_HTMLPARSER:
    class ParseError(SGMLLIB_PARSEERROR,
                     HTMLParser.HTMLParseError,
                     ):
        """Parse failure; derives from both underlying parsers' errors."""
else:
    class ParseError(SGMLLIB_PARSEERROR):
        """Parse failure; derives from the sgmllib-level error type."""
465
466
class _AbstractFormParser:
    """forms attribute contains HTMLForm instances on completion.

    Mixin holding the tag handlers shared by the concrete form parsers.
    While parsing, each entry of self.forms is a tuple
    ((name, action, method, enctype), attrs_dict, controls), where controls
    is a list of (type, name, attrs_dict) tuples.  forms[0] collects the
    controls found outside any FORM element.

    Subclasses must provide unescape_attr_if_required(), which is applied
    to every attribute value read here.  Attribute values may be None for
    attributes written without a value.
    """
    # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        """entitydefs: mapping like {"&amp;": "&", ...}; get_entitydefs()
            supplies the default.
        encoding: encoding used when unescaping entity/character references.
        """
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs
        self._encoding = encoding

        self.base = None
        self.forms = []
        self.labels = []
        self._current_label = None
        self._current_form = None
        self._select = None
        self._optgroup = None
        self._option = None
        self._textarea = None

        # forms[0] will contain all controls that are outside of any form
        # self._global_form is an alias for self.forms[0]
        self._global_form = None
        self.start_form([])
        self.end_form()
        self._current_form = self._global_form = self.forms[0]

    def do_base(self, attrs):
        """Record the href of a BASE element in self.base."""
        debug("%s", attrs)
        for key, value in attrs:
            if key == "href":
                self.base = self.unescape_attr_if_required(value)

    def end_body(self):
        """Close any label and form still open at the end of the document."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is not self._global_form:
            self.end_form()

    def start_form(self, attrs):
        """Open a new form; raises ParseError if one is already open."""
        debug("%s", attrs)
        if self._current_form is not self._global_form:
            raise ParseError("nested FORMs")
        name = None
        action = None
        enctype = "application/x-www-form-urlencoded"
        method = "GET"
        d = {}
        for key, value in attrs:
            if key == "name":
                name = self.unescape_attr_if_required(value)
            elif key == "action":
                action = self.unescape_attr_if_required(value)
            elif key == "method":
                # a valueless attribute arrives as None from the
                # HTMLParser-based parser: keep the default rather than
                # failing on None.upper()
                if value is not None:
                    method = self.unescape_attr_if_required(value.upper())
            elif key == "enctype":
                if value is not None:
                    enctype = self.unescape_attr_if_required(value.lower())
            d[key] = self.unescape_attr_if_required(value)
        controls = []
        self._current_form = (name, action, method, enctype), d, controls

    def end_form(self):
        """Store the open form in self.forms and return to the global form."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is self._global_form:
            raise ParseError("end of FORM before start")
        self.forms.append(self._current_form)
        self._current_form = self._global_form

    def start_select(self, attrs):
        """Open a SELECT and append a marker control carrying its attrs."""
        debug("%s", attrs)
        if self._select is not None:
            raise ParseError("nested SELECTs")
        if self._textarea is not None:
            raise ParseError("SELECT inside TEXTAREA")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._select = d
        self._add_label(d)

        self._append_select_control({"__select": d})

    def end_select(self):
        """Close the open SELECT, finishing any OPTION still open in it."""
        debug("")
        if self._select is None:
            raise ParseError("end of SELECT before start")

        if self._option is not None:
            self._end_option()

        self._select = None

    def start_optgroup(self, attrs):
        """Remember the attributes of the OPTGROUP being entered."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTGROUP outside of SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._optgroup = d

    def end_optgroup(self):
        """Leave the current OPTGROUP."""
        debug("")
        if self._optgroup is None:
            raise ParseError("end of OPTGROUP before start")
        self._optgroup = None

    def _start_option(self, attrs):
        """Begin an OPTION, implicitly ending the previous one if open."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTION outside of SELECT")
        if self._option is not None:
            self._end_option()

        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._option = {}
        self._option.update(d)
        # options inherit "disabled" from an enclosing disabled OPTGROUP
        if (self._optgroup and self._optgroup.has_key("disabled") and
            not self._option.has_key("disabled")):
            self._option["disabled"] = None

    def _end_option(self):
        """Finish the open OPTION and append it as a "select" control."""
        debug("")
        if self._option is None:
            raise ParseError("end of OPTION before start")

        contents = self._option.get("contents", "").strip()
        self._option["contents"] = contents
        # the element text doubles as value and label when those are absent
        if not self._option.has_key("value"):
            self._option["value"] = contents
        if not self._option.has_key("label"):
            self._option["label"] = contents
        # stuff dict of SELECT HTML attrs into a special private key
        #  (gets deleted again later)
        self._option["__select"] = self._select
        self._append_select_control(self._option)
        self._option = None

    def _append_select_control(self, attrs):
        """Append a ("select", name, attrs) entry to the current form."""
        debug("%s", attrs)
        controls = self._current_form[2]
        name = self._select.get("name")
        controls.append(("select", name, attrs))

    def start_textarea(self, attrs):
        """Open a TEXTAREA; its text is gathered by handle_data()."""
        debug("%s", attrs)
        if self._textarea is not None:
            raise ParseError("nested TEXTAREAs")
        if self._select is not None:
            raise ParseError("TEXTAREA inside SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        self._add_label(d)

        self._textarea = d

    def end_textarea(self):
        """Close the TEXTAREA and append it to the current form."""
        debug("")
        if self._textarea is None:
            raise ParseError("end of TEXTAREA before start")
        controls = self._current_form[2]
        name = self._textarea.get("name")
        controls.append(("textarea", name, self._textarea))
        self._textarea = None

    def start_label(self, attrs):
        """Open a LABEL, closing any label that is still open."""
        debug("%s", attrs)
        if self._current_label:
            self.end_label()
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        taken = bool(d.get("for"))  # empty id is invalid
        d["__text"] = ""
        d["__taken"] = taken
        if taken:
            self.labels.append(d)
        self._current_label = d

    def end_label(self):
        """Close the current label, if any."""
        debug("")
        label = self._current_label
        if label is None:
            # something is ugly in the HTML, but we're ignoring it
            return
        self._current_label = None
        # if it is staying around, it is True in all cases
        del label["__taken"]

    def _add_label(self, d):
        """Attach the open label (if any) to control attrs d as "__label"."""
        #debug("%s", d)
        if self._current_label is not None:
            if not self._current_label["__taken"]:
                self._current_label["__taken"] = True
                d["__label"] = self._current_label

    def handle_data(self, data):
        """Append text to the innermost open OPTION, TEXTAREA or LABEL."""
        debug("%s", data)

        if self._option is not None:
            # self._option is a dictionary of the OPTION element's HTML
            # attributes, but it has two special keys, one of which is the
            # special "contents" key contains text between OPTION tags (the
            # other is the "__select" key: see the end_option method)
            target = self._option
            key = "contents"
        elif self._textarea is not None:
            target = self._textarea
            key = "value"
            data = normalize_line_endings(data)
        # not if within option or textarea
        elif self._current_label is not None:
            target = self._current_label
            key = "__text"
        else:
            return

        if data and not target.has_key(key):
            # according to
            # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
            # immediately after start tags or immediately before end tags must
            # be ignored, but real browsers only ignore a line break after a
            # start tag, so we'll do that.
            if data[0:2] == "\r\n":
                data = data[2:]
            elif data[0:1] in ["\n", "\r"]:
                data = data[1:]
            target[key] = data
        else:
            target[key] = target[key] + data

    def do_button(self, attrs):
        """Append a BUTTON element as a "<type>button" control."""
        debug("%s", attrs)
        d = {}
        d["type"] = "submit"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        control_type = d["type"]
        if control_type is None:
            # valueless "type" attribute: fall back to the default instead
            # of failing on None + "button"
            control_type = "submit"
        name = d.get("name")
        # we don't want to lose information, so use a type string that
        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
        # e.g. type for BUTTON/RESET is "resetbutton"
        #     (type for INPUT/RESET is "reset")
        control_type = control_type + "button"
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_input(self, attrs):
        """Append an INPUT element; its type defaults to "text"."""
        debug("%s", attrs)
        d = {}
        d["type"] = "text"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        control_type = d["type"]
        name = d.get("name")
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_isindex(self, attrs):
        """Append an ISINDEX element to the current form."""
        debug("%s", attrs)
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        self._add_label(d)
        # isindex doesn't have type or name HTML attributes
        controls.append(("isindex", None, d))

    def handle_entityref(self, name):
        """Feed the expansion of entity reference &name; to handle_data()."""
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        """Feed the decoded character reference to handle_data()."""
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        """Return name with entity and character references expanded."""
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        """Return a copy of dict attrs with every value unescaped.

        Values that are themselves mappings are processed recursively.
        """
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            try:
                val.items
            except AttributeError:
                escaped_attrs[key] = self.unescape_attr(val)
            else:
                # e.g. "__select" -- yuck!
                escaped_attrs[key] = self.unescape_attrs(val)
        return escaped_attrs

    # unknown references are passed through as literal text
    def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
    def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
776
777
if HAVE_MODULE_HTMLPARSER:
    class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
        """Good for XHTML, bad for tolerance of incorrect HTML."""
        # thanks to Michael Howitz for this!
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            HTMLParser.HTMLParser.__init__(self)
            _AbstractFormParser.__init__(self, entitydefs, encoding)

        def feed(self, data):
            """Feed data to HTMLParser, re-raising its errors as ParseError."""
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError:
                raise ParseError(sys.exc_info()[1])

        def start_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

        def end_option(self):
            _AbstractFormParser._end_option(self)

        def handle_starttag(self, tag, attrs):
            """Call start_<tag>, or failing that do_<tag>, when defined."""
            for prefix in ("start_", "do_"):
                try:
                    method = getattr(self, prefix + tag)
                except AttributeError:
                    continue  # try the next prefix; unknown tag if none
                method(attrs)
                break

        def handle_endtag(self, tag):
            """Call end_<tag> when defined; other tags are ignored."""
            try:
                method = getattr(self, "end_" + tag)
            except AttributeError:
                return  # unknown tag
            method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it
        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto

        def close(self):
            """Finish parsing and close any label/form left open."""
            HTMLParser.HTMLParser.close(self)
            self.end_body()
else:
    class XHTMLCompatibleFormParser:
        """Placeholder: cannot be instantiated without HTMLParser."""
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            raise ValueError("HTMLParser could not be imported")
836
837
class _AbstractSgmllibParser(_AbstractFormParser):
    """Form-parser behaviour shared by the sgmllib-style parsers."""

    def do_option(self, attrs):
        _AbstractFormParser._start_option(self, attrs)

    if sys.version_info[:2] < (2, 5):
        # attribute values still contain references here, so expand them
        def unescape_attr_if_required(self, name):
            return self.unescape_attr(name)
        def unescape_attrs_if_required(self, attrs):
            return self.unescape_attrs(attrs)
    else:
        # we override this attr to decode hex charrefs
        entity_or_charref = re.compile(
            '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
        def convert_entityref(self, name):
            reference = "&%s;" % name
            return unescape(reference, self._entitydefs, self._encoding)
        def convert_charref(self, name):
            return unescape_charref("%s" % name, self._encoding)
        def unescape_attr_if_required(self, name):
            return name  # sgmllib already did it
        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto
860
861
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
    """Good for tolerance of incorrect HTML, bad for XHTML."""
    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        sgmllib.SGMLParser.__init__(self)
        _AbstractFormParser.__init__(self, entitydefs, encoding)

    def feed(self, data):
        """Feed data to sgmllib, re-raising its parse errors as ParseError."""
        try:
            sgmllib.SGMLParser.feed(self, data)
        except SGMLLIB_PARSEERROR:
            raise ParseError(sys.exc_info()[1])

    def close(self):
        """Finish parsing and close any label/form left open."""
        sgmllib.SGMLParser.close(self)
        self.end_body()
877
878
879# sigh, must support mechanize by allowing dynamic creation of classes based on
880# its bundled copy of BeautifulSoup (which was necessary because of dependency
881# problems)
882
def _create_bs_classes(bs,
                       icbinbs,
                       ):
    """Build the two BeautifulSoup-backed form parser classes.

    bs, icbinbs: the parser base classes to mix in (the module-level caller
    passes BeautifulSoup.BeautifulSoup and
    BeautifulSoup.ICantBelieveItsBeautifulSoup).

    Returns the tuple (RobustFormParser, NestingRobustFormParser).
    """
    class _AbstractBSFormParser(_AbstractSgmllibParser):
        # concrete subclasses set this to the parser class they mix in
        bs_base_class = None
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            _AbstractFormParser.__init__(self, entitydefs, encoding)
            self.bs_base_class.__init__(self)
        def handle_data(self, data):
            # both the form parser and the soup parser get to see the text
            _AbstractFormParser.handle_data(self, data)
            self.bs_base_class.handle_data(self, data)
        def feed(self, data):
            try:
                self.bs_base_class.feed(self, data)
            except SGMLLIB_PARSEERROR:
                raise ParseError(sys.exc_info()[1])
        def close(self):
            self.bs_base_class.close(self)
            self.end_body()

    class RobustFormParser(_AbstractBSFormParser, bs):
        """Tries to be highly tolerant of incorrect HTML."""
        bs_base_class = bs

    class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
        """Tries to be highly tolerant of incorrect HTML.

        Different from RobustFormParser in that it more often guesses nesting
        above missing end tags (see BeautifulSoup docs).

        """
        bs_base_class = icbinbs

    return RobustFormParser, NestingRobustFormParser
918
# Optional BeautifulSoup support: when the module can be imported, build the
# two robust parser classes and add their names to the public API.
try:
    if sys.version_info[:2] < (2, 2):
        raise ImportError  # BeautifulSoup uses generators
    import BeautifulSoup
except ImportError:
    # BeautifulSoup unavailable (or interpreter too old): the robust parser
    # classes are simply not defined
    pass
else:
    RobustFormParser, NestingRobustFormParser = _create_bs_classes(
        BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
        )
    __all__ += ['RobustFormParser', 'NestingRobustFormParser']
930
931
932#FormParser = XHTMLCompatibleFormParser # testing hack
933#FormParser = RobustFormParser # testing hack
934
935
def ParseResponseEx(response,
                    select_default=False,
                    form_parser_class=FormParser,
                    request_class=urllib2.Request,
                    entitydefs=None,
                    encoding=DEFAULT_ENCODING,

                    # private
                    _urljoin=urlparse.urljoin,
                    _urlparse=urlparse.urlparse,
                    _urlunparse=urlparse.urlunparse,
                    ):
    """Identical to ParseResponse, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    # The base URI comes from the response itself.  The two literal False
    # arguments fill positional slots of _ParseFileEx that this function
    # does not expose (presumably ignore_errors and backwards_compat --
    # confirm against _ParseFileEx).
    base_uri = response.geturl()
    private_hooks = {
        "_urljoin": _urljoin,
        "_urlparse": _urlparse,
        "_urlunparse": _urlunparse,
        }
    return _ParseFileEx(response, base_uri,
                        select_default,
                        False,
                        form_parser_class,
                        request_class,
                        entitydefs,
                        False,
                        encoding,
                        **private_hooks)
969
def ParseFileEx(file, base_uri,
                select_default=False,
                form_parser_class=FormParser,
                request_class=urllib2.Request,
                entitydefs=None,
                encoding=DEFAULT_ENCODING,

                # private
                _urljoin=urlparse.urljoin,
                _urlparse=urlparse.urlparse,
                _urlunparse=urlparse.urlunparse,
                ):
    """Identical to ParseFile, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    # The two literal False arguments fill positional slots of _ParseFileEx
    # that this function does not expose (presumably ignore_errors and
    # backwards_compat -- confirm against _ParseFileEx).
    private_hooks = {
        "_urljoin": _urljoin,
        "_urlparse": _urlparse,
        "_urlunparse": _urlunparse,
        }
    return _ParseFileEx(file, base_uri,
                        select_default,
                        False,
                        form_parser_class,
                        request_class,
                        entitydefs,
                        False,
                        encoding,
                        **private_hooks)
1003
def ParseResponse(response, *args, **kwds):
    """Parse an HTTP response and return a list of HTMLForm instances.

    The return value of urllib2.urlopen can be passed directly to this
    function as the response parameter.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (supporting read() method) with a method
     geturl(), returning the URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    form_parser_class: class to instantiate and use to parse the HTML
    request_class: class to return from .click() method (default is
     urllib2.Request)
    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
     definitions (a sensible default is used)
    encoding: character encoding used for encoding numeric character
     references when matching link text.  ClientForm does not attempt to find
     the encoding in a META HTTP-EQUIV attribute in the document itself
     (mechanize, for example, does do that and will pass the correct value to
     ClientForm using this parameter).

    backwards_compat: boolean that determines whether the returned HTMLForm
     objects are backwards-compatible with old code.  If backwards_compat is
     true:

     - ClientForm 0.1 code will continue to work as before.

     - Label searches that do not specify a nr (number or count) will always
       get the first match, even if other controls match.  If
       backwards_compat is False, label searches that have ambiguous results
       will raise an AmbiguityError.

     - Item label matching is done by strict string comparison rather than
       substring matching.

     - De-selecting individual list items is allowed even if the Item is
       disabled.

    The backwards_compat argument will be deprecated in a future release.

    Pass a true value for select_default if you want the behaviour specified
    by RFC 1866 (the HTML 2.0 standard), which is to select the first item in
    a RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.
    The W3C HTML 4.0 standard leaves this behaviour undefined in the case of
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
    HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
    sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
    Note that HTMLParser is only available in Python 2.2 and later.  You can
    pass your own class in here as a hack to work around bad HTML, but at
    your own risk: there is no well-defined interface.

    """
    all_forms = _ParseFileEx(response, response.geturl(), *args, **kwds)
    # The first entry is the extra form holding controls found outside any
    # FORM element (see ParseFileEx); this older API never exposed it.
    return all_forms[1:]
1065
def ParseFile(file, base_uri, *args, **kwds):
    """Parse HTML read from a file and return a list of HTMLForm instances.

    ClientForm.ParseError is raised on parse errors.

    file: file-like object (supporting read() method) containing HTML with
     zero or more forms to be parsed
    base_uri: the URI of the document (note that the base URI used to submit
     the form will be that given in the BASE element if present, not that of
     the document)

    For the other arguments and further details, see ParseResponse.__doc__.

    """
    all_forms = _ParseFileEx(file, base_uri, *args, **kwds)
    # Skip the leading form that collects controls outside any FORM element
    # (see ParseFileEx).
    return all_forms[1:]
1081
1082def _ParseFileEx(file, base_uri,
1083 select_default=False,
1084 ignore_errors=False,
1085 form_parser_class=FormParser,
1086 request_class=urllib2.Request,
1087 entitydefs=None,
1088 backwards_compat=True,
1089 encoding=DEFAULT_ENCODING,
1090 _urljoin=urlparse.urljoin,
1091 _urlparse=urlparse.urlparse,
1092 _urlunparse=urlparse.urlunparse,
1093 ):
1094 if backwards_compat:
1095 deprecation("operating in backwards-compatibility mode", 1)
1096 fp = form_parser_class(entitydefs, encoding)
1097 while 1:
1098 data = file.read(CHUNK)
1099 try:
1100 fp.feed(data)
1101 except ParseError, e:
1102 e.base_uri = base_uri
1103 raise
1104 if len(data) != CHUNK: break
1105 fp.close()
1106 if fp.base is not None:
1107 # HTML BASE element takes precedence over document URI
1108 base_uri = fp.base
1109 labels = [] # Label(label) for label in fp.labels]
1110 id_to_labels = {}
1111 for l in fp.labels:
1112 label = Label(l)
1113 labels.append(label)
1114 for_id = l["for"]
1115 coll = id_to_labels.get(for_id)
1116 if coll is None:
1117 id_to_labels[for_id] = [label]
1118 else:
1119 coll.append(label)
1120 forms = []
1121 for (name, action, method, enctype), attrs, controls in fp.forms:
1122 if action is None:
1123 action = base_uri
1124 else:
1125 action = _urljoin(base_uri, action)
1126 # would be nice to make HTMLForm class (form builder) pluggable
1127 form = HTMLForm(
1128 action, method, enctype, name, attrs, request_class,
1129 forms, labels, id_to_labels, backwards_compat)
1130 form._urlparse = _urlparse
1131 form._urlunparse = _urlunparse
1132 for ii in range(len(controls)):
1133 type, name, attrs = controls[ii]
1134 # index=ii*10 allows ImageControl to return multiple ordered pairs
1135 form.new_control(
1136 type, name, attrs, select_default=select_default, index=ii * 10)
1137 forms.append(form)
1138 for form in forms:
1139 form.fixup()
1140 return forms
1141
1142
class Label:
    """Text label attached to a control or item.

    Public attributes:

     id: value of the label's "for" attribute (None if absent)
     text: label text (read-only); the whitespace-compressed form unless
      _backwards_compat is true, in which case only the stripped form
     attrs: the attribute mapping the label was built from

    """
    def __init__(self, attrs):
        self.id = attrs.get("for")
        stripped = attrs.get("__text").strip()
        self._text = stripped
        self._ctext = compress_text(stripped)
        self.attrs = attrs
        self._backwards_compat = False  # maintained by HTMLForm

    def __getattr__(self, name):
        # Only reached when normal lookup fails; "text" is computed here.
        if name != "text":
            return getattr(Label, name)
        if self._backwards_compat:
            return self._text
        return self._ctext

    def __setattr__(self, name, value):
        if name == "text":
            # don't see any need for this, so make it read-only
            raise AttributeError("text attribute is read-only")
        self.__dict__[name] = value

    def __str__(self):
        details = (self.id, self.text)
        return "<Label(id=%r, text=%r)>" % details
1167
1168
def _get_label(attrs):
    """Return a Label built from attrs["__label"], or None.

    None is returned when attrs has no "__label" entry (or it is None).

    """
    label_attrs = attrs.get("__label")
    if label_attrs is None:
        return None
    return Label(label_attrs)
1175
class Control:
    """An HTML form control.

    An HTMLForm contains a sequence of Controls.  The Controls in an HTMLForm
    are accessed using the HTMLForm.find_control method or the
    HTMLForm.controls attribute.

    Control instances are usually constructed using the ParseFile /
    ParseResponse functions.  If you use those functions, you can ignore the
    rest of this paragraph.  A Control is only properly initialised after the
    fixup method has been called.  In fact, this is only strictly necessary
    for ListControl instances.  This is necessary because ListControls are
    built up from ListControls each containing only a single item, and their
    initial value(s) can only be known after the sequence is complete.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by 'greying out' a control.  If the disabled
    attribute is true, the Control will raise AttributeError if an attempt is
    made to change its value.  In addition, the control will not be considered
    'successful' as defined by the W3C HTML 4 standard -- ie. it will
    contribute no data to the return value of the HTMLForm.click* methods.  To
    enable a control, set the disabled attribute to a false value.

    If the readonly attribute is true, the Control will raise AttributeError
    if an attempt is made to change its value.  To make a control writable,
    set the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those
    that may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are raised:
    TypeError, AttributeError (if the value attribute should not be assigned
    to, because the control is disabled, for example) and ValueError.

    If the name or value attributes are None, or the value is an empty list,
    or if the control is disabled, the control is not successful.

    Public attributes:

     type: string describing type of control (see the keys of the
      HTMLForm.type2class dictionary for the allowable values) (readonly)
     name: name of control (readonly)
     value: current value of control (subclasses may allow a single value, a
      sequence of values, or either)
     disabled: disabled state
     readonly: readonly state
     id: value of id HTML attribute

    """
    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Remember the owning form and append this control to it."""
        self._form = form
        form.controls.append(self)

    def fixup(self):
        """Hook called once the form is complete; does nothing here."""
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    def __getattr__(self, name):
        raise NotImplementedError()

    def __setattr__(self, name, value):
        raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        # Same data as _totally_ordered_pairs(), minus the leading index.
        result = []
        for index, key, value in self._totally_ordered_pairs():
            result.append((key, value))
        return result

    def _totally_ordered_pairs(self):
        """Return list of (index, key, value) tuples.

        Like pairs, but allows preserving correct ordering even where several
        controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw, name, value):
        """Write data for a subitem of this control to a MimeWriter."""
        # called by HTMLForm
        part = mw.nextpart()
        disposition = 'form-data; name="%s"' % name
        part.addheader("Content-Disposition", disposition, 1)
        body = part.startbody(prefix=0)
        body.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        If the control was surrounded by a <label> tag, that will be the first
        label; all other labels, connected by 'for' and 'id', are in the order
        that appear in the HTML.

        """
        found = []
        if self._label:
            found.append(self._label)
        if self.id:
            # labels that point at this control through for="<id>"
            by_id = self._form._id_to_labels
            found.extend(by_id.get(self.id, ()))
        return found
1294
1295
1296#---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

     attrs: dictionary mapping the names of original HTML attributes of the
      control to their values

    """
    def __init__(self, type, name, attrs, index=None):
        """
        type: control type string (stored lower-cased)
        name: control name
        attrs: HTML attributes of the control's element (a copy is kept)
        index: ordering key reported by _totally_ordered_pairs

        """
        self._index = index
        self._label = _get_label(attrs)
        # type and name are rejected by __setattr__, so they are written
        # straight into the instance dictionary.
        self.__dict__["type"] = type.lower()
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        # the mere presence of the HTML attribute sets the flag
        self.disabled = attrs.has_key("disabled")
        self.readonly = attrs.has_key("readonly")
        self.id = attrs.get("id")

        self.attrs = attrs.copy()

        self._clicked = False

        self._urlparse = urlparse.urlparse
        self._urlunparse = urlparse.urlunparse

    def __getattr__(self, name):
        # "value" is the only computed attribute; it is backed by _value.
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # Assigning the value: validate the type first, then the state.
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def _totally_ordered_pairs(self):
        name = self.name
        value = self.value
        unsuccessful = name is None or value is None or self.disabled
        if unsuccessful:
            return []
        return [(self._index, name, value)]

    def clear(self):
        """Reset the value to None; raises AttributeError if readonly."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        shown_name = self.name
        if shown_name is None:
            shown_name = "<None>"
        shown_value = self.value
        if shown_value is None:
            shown_value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (
            self.__class__.__name__, shown_name, shown_value, suffix)
1372
1373
1374#---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/HIDDEN
    TEXTAREA

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # hidden fields are always treated as read-only
        if self.type == "hidden":
            self.readonly = True
        # a missing value attribute means the empty string, not None
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        """Return true only for the kind "text"."""
        return kind == "text"
1393
1394#---------------------------------------------------
class FileControl(ScalarControl):
    """File upload with INPUT TYPE=FILE.

    The value attribute of a FileControl is always None.  Use add_file
    instead.

    Additional public method: add_file

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        self._value = None
        # list of (file_object, content_type, filename) tuples
        self._upload_data = []

    def is_of_kind(self, kind):
        """Return true only for the kind "file"."""
        return kind == "file"

    def clear(self):
        """Forget all added files; raises AttributeError if readonly."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self._upload_data = []

    def __setattr__(self, name, value):
        if name in ("value", "name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value

    def add_file(self, file_object, content_type=None, filename=None):
        """Queue a file for upload with this control.

        file_object: object with a read method
        content_type: None or string-like; None is replaced by
         "application/octet-stream"
        filename: None or string-like

        TypeError is raised if any argument has the wrong type.

        """
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if content_type is not None and not isstringlike(content_type):
            raise TypeError("content type must be None or string-like")
        if filename is not None and not isstringlike(filename):
            raise TypeError("filename must be None or string-like")
        if content_type is None:
            content_type = "application/octet-stream"
        entry = (file_object, content_type, filename)
        self._upload_data.append(entry)

    def _totally_ordered_pairs(self):
        # XXX should it be successful even if unnamed?
        if self.name is None or self.disabled:
            return []
        # the file contents themselves are written by _write_mime_data
        return [(self._index, self.name, "")]

    def _write_mime_data(self, mw, _name, _value):
        # called by HTMLForm
        # assert _name == self.name and _value == ''
        uploads = self._upload_data
        if len(uploads) >= 2:
            # multiple files: one "mixed" multipart nested in this part
            outer = mw.nextpart()
            disp = 'form-data; name="%s"' % self.name
            outer.addheader("Content-Disposition", disp, prefix=1)
            outer.startmultipartbody("mixed", prefix=0)
            for file_object, content_type, filename in uploads:
                inner = outer.nextpart()
                if filename is None:
                    filename = ""
                fn_part = '; filename="%s"' % filename
                disp = "file%s" % fn_part
                inner.addheader("Content-Disposition", disp, prefix=1)
                body = inner.startbody(content_type, prefix=0)
                body.write(file_object.read())
            outer.lastpart()
            return

        # zero or one file: a single part
        if len(uploads) == 0:
            # nothing added: send an empty, unnamed file
            file_object = StringIO()
            content_type = "application/octet-stream"
            filename = ""
        else:
            file_object, content_type, filename = uploads[0]
            if filename is None:
                filename = ""
        part = mw.nextpart()
        fn_part = '; filename="%s"' % filename
        disp = 'form-data; name="%s"%s' % (self.name, fn_part)
        part.addheader("Content-Disposition", disp, prefix=1)
        body = part.startbody(content_type, prefix=0)
        body.write(file_object.read())

    def __str__(self):
        shown_name = self.name
        if shown_name is None:
            shown_name = "<None>"

        if self._upload_data:
            filenames = []
            for file, ctype, filename in self._upload_data:
                if filename is None:
                    filename = "<Unnamed file>"
                filenames.append(filename)
            shown_value = ", ".join(filenames)
        else:
            shown_value = "<No files added>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (
            self.__class__.__name__, shown_name, shown_value, suffix)
1496
1497
1498#---------------------------------------------------
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't
    really part of regular HTML forms at all, and predates it.  You're only
    allowed one ISINDEX per HTML document.  ISINDEX and regular form
    submission are mutually exclusive -- either submit a form, or the ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances it returns.  You can set the ISINDEX's value, as with
    any other control (but note that ISINDEX controls have no name, so you'll
    need to use the type argument of set_value!).  When you submit the form,
    the ISINDEX will not be successful (ie., no data will get returned to the
    server as a result of its presence), unless you click on the ISINDEX
    control, in which case the ISINDEX gets submitted instead of the form:

    form.set_value("my isindex value", type="isindex")
    urllib2.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit one
    by hand, do it like so:

    url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
    result = urllib2.urlopen(url)

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # a missing value attribute means the empty string, not None
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        """Return true for the kinds "text" and "clickable"."""
        return kind in ["text", "clickable"]

    def _totally_ordered_pairs(self):
        # never successful as part of an ordinary form submission
        return []

    def _click(self, form, coord, return_type, request_class=urllib2.Request):
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        split = self._urlparse(form.action)
        # Replace the last two components (query and fragment) of the form's
        # action: the query becomes the quoted value, the fragment is dropped.
        leading, (_query, _frag) = split[:-2], split[-2:]
        new_query = urllib.quote_plus(self.value)
        url = self._urlunparse(leading + (new_query, None))

        if return_type == "pairs":
            return []
        if return_type == "request_data":
            return url, None, []
        return request_class(url)

    def __str__(self):
        shown_value = self.value
        if shown_value is None:
            shown_value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s)%s>" % (self.__class__.__name__, shown_value, suffix)
1566
1567
1568#---------------------------------------------------
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    These controls are always unsuccessful, in the terminology of HTML 4 (ie.
    they never require any information to be returned to the server).

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # discard whatever value attribute the HTML carried
        self._value = None

    def is_of_kind(self, kind):
        """Ignored controls belong to no kind: always returns False."""
        return False

    def __setattr__(self, name, value):
        if name == "value":
            raise AttributeError(
                "control '%s' is ignored, hence read-only" % self.name)
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value
1601
1602
1603#---------------------------------------------------
1604# ListControls
1605
1606# helpers and subsidiary classes
1607
class Item:
    """A single item of a list control.

    The constructor appends the new item to control.items.  Attribute
    assignment is restricted: only "selected" (delegated to the owning
    control) and "disabled" (coerced to bool) may be assigned.

    """
    def __init__(self, control, attrs, index=None):
        label = _get_label(attrs)
        if label:
            labels = [label]
        else:
            labels = []
        # __setattr__ rejects these names, so the state is collected first
        # and then written into the instance dictionary in one step.
        state = {}
        state["name"] = attrs["value"]
        state["_labels"] = labels
        state["attrs"] = attrs
        state["_control"] = control
        state["disabled"] = attrs.has_key("disabled")
        state["_selected"] = False
        state["id"] = attrs.get("id")
        state["_index"] = index
        self.__dict__.update(state)
        control.items.append(self)

    def get_labels(self):
        """Return all labels (Label instances) for this item.

        For items that represent radio buttons or checkboxes, if the item was
        surrounded by a <label> tag, that will be the first label; all other
        labels, connected by 'for' and 'id', are in the order that appear in
        the HTML.

        For items that represent select options, if the option had a label
        attribute, that will be the first label.  If the option has contents
        (text within the option tags) and it is not the same as the label
        attribute (if any), that will be a label.  There is nothing in the
        spec to my knowledge that makes an option with an id unable to be the
        target of a label's for attribute, so those are included, if any, for
        the sake of consistency and completeness.

        """
        found = list(self._labels)
        if self.id:
            by_id = self._control._form._id_to_labels
            found.extend(by_id.get(self.id, ()))
        return found

    def __getattr__(self, name):
        # "selected" is the only computed attribute; it mirrors _selected.
        if name != "selected":
            raise AttributeError(name)
        return self._selected

    def __setattr__(self, name, value):
        if name == "selected":
            # the owning control enforces its own selection rules
            self._control._set_selected_state(self, value)
            return
        if name == "disabled":
            self.__dict__["disabled"] = bool(value)
            return
        raise AttributeError(name)

    def __str__(self):
        # "*" marks a selected item, parentheses a disabled one
        text = self.name
        if self.selected:
            text = "*" + text
        if self.disabled:
            text = "(%s)" % text
        return text

    def __repr__(self):
        # XXX appending the attrs without distinguishing them from name and id
        # is silly
        fields = [("name", self.name), ("id", self.id)]
        fields = fields + list(self.attrs.items())
        rendered = []
        for key, value in fields:
            rendered.append("%s=%r" % (key, value))
        return "<%s %s>" % (self.__class__.__name__, " ".join(rendered))
1675
def disambiguate(items, nr, **kwds):
    """Pick one entry from items, using nr to resolve ambiguity.

    items: sequence of candidates
    nr: 0-based index into items, or None to require a unique candidate
    kwds: the search criteria, used only to build the error message

    Raises ItemNotFoundError if items is empty or nr is out of range, and
    AmbiguityError if nr is None and there is more than one candidate.

    """
    msg = " ".join(["%s=%r" % (key, value) for key, value in kwds.items()])
    if not items:
        raise ItemNotFoundError(msg)
    if nr is None:
        # no index given: the match has to be unique
        if len(items) > 1:
            raise AmbiguityError(msg)
        return items[0]
    if len(items) <= nr:
        raise ItemNotFoundError(msg)
    return items[nr]
1690
1691class ListControl(Control):
1692 """Control representing a sequence of items.
1693
1694 The value attribute of a ListControl represents the successful list items
1695 in the control. The successful list items are those that are selected and
1696 not disabled.
1697
1698 ListControl implements both list controls that take a length-1 value
1699 (single-selection) and those that take length >1 values
1700 (multiple-selection).
1701
1702 ListControls accept sequence values only. Some controls only accept
1703 sequences of length 0 or 1 (RADIO, and single-selection SELECT).
1704 In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
1705 and multiple-selection SELECTs (those having the "multiple" HTML attribute)
1706 accept sequences of any length.
1707
1708 Note the following mistake:
1709
1710 control.value = some_value
1711 assert control.value == some_value # not necessarily true
1712
1713 The reason for this is that the value attribute always gives the list items
1714 in the order they were listed in the HTML.
1715
1716 ListControl items can also be referred to by their labels instead of names.
1717 Use the label argument to .get(), and the .set_value_by_label(),
1718 .get_value_by_label() methods.
1719
1720 Note that, rather confusingly, though SELECT controls are represented in
1721 HTML by SELECT elements (which contain OPTION elements, representing
1722 individual list items), CHECKBOXes and RADIOs are not represented by *any*
1723 element. Instead, those controls are represented by a collection of INPUT
1724 elements. For example, this is a SELECT control, named "control1":
1725
1726 <select name="control1">
1727 <option>foo</option>
1728 <option value="1">bar</option>
1729 </select>
1730
1731 and this is a CHECKBOX control, named "control2":
1732
1733 <input type="checkbox" name="control2" value="foo" id="cbe1">
1734 <input type="checkbox" name="control2" value="bar" id="cbe2">
1735
1736 The id attribute of a CHECKBOX or RADIO ListControl is always that of its
1737 first element (for example, "cbe1" above).
1738
1739
1740 Additional read-only public attribute: multiple.
1741
1742 """
1743
1744 # ListControls are built up by the parser from their component items by
1745 # creating one ListControl per item, consolidating them into a single
1746 # master ListControl held by the HTMLForm:
1747
1748 # -User calls form.new_control(...)
1749 # -Form creates Control, and calls control.add_to_form(self).
1750 # -Control looks for a Control with the same name and type in the form,
1751 # and if it finds one, merges itself with that control by calling
1752 # control.merge_control(self). The first Control added to the form, of
1753 # a particular name and type, is the only one that survives in the
1754 # form.
1755 # -Form calls control.fixup for all its controls. ListControls in the
1756 # form know they can now safely pick their default values.
1757
1758 # To create a ListControl without an HTMLForm, use:
1759
1760 # control.merge_control(new_control)
1761
1762 # (actually, it's much easier just to use ParseFile)
1763
1764 _label = None
1765
def __init__(self, type, name, attrs={}, select_default=False,
             called_as_base_class=False, index=None):
    """Set up the state shared by all list controls.

    This initialiser is for subclasses only: NotImplementedError is raised
    unless called_as_base_class is true.

    type: control type string (stored lower-cased)
    name: control name
    attrs: HTML attributes of the element; only "value" and "id" are read
     here, and the mapping is not modified
    select_default: for RADIO and multiple-selection SELECT controls, pick
     the first item as the default if no 'selected' HTML attribute is
     present
    index: accepted, but not used by this method

    """
    if not called_as_base_class:
        raise NotImplementedError()

    # type and name bypass attribute assignment and go straight into the
    # instance dictionary.
    self.__dict__["type"] = type.lower()
    self.__dict__["name"] = name
    self._value = attrs.get("value")
    self.disabled = False
    self.readonly = False
    self.id = attrs.get("id")
    self._closed = False

    # As Controls are merged in with .merge_control(), self.attrs will
    # refer to each Control in turn -- always the most recently merged
    # control.  Each merged-in Control instance corresponds to a single
    # list item: see ListControl.__doc__.
    self.items = []
    self._form = None

    self._select_default = select_default
    self._clicked = False
1794
def clear(self):
    """Deselect everything by assigning an empty list to value."""
    nothing_selected = []
    self.value = nothing_selected
1797
def is_of_kind(self, kind):
    """Return whether this control is of the given kind.

    "list" matches every list control, "multilist" those whose multiple
    attribute is true, "singlelist" those whose multiple attribute is false.
    Any other kind gives False.

    """
    if kind == "list":
        return True
    if kind == "multilist":
        return bool(self.multiple)
    if kind == "singlelist":
        return not self.multiple
    return False
1807
def get_items(self, name=None, label=None, id=None,
              exclude_disabled=False):
    """Return matching items by name or label.

    Every criterion that is not None must match.  The result keeps the
    order of self.items.  TypeError is raised if name, label or id is given
    but is not string-like.

    For argument docs, see the docstring for .get()

    """
    if name is not None and not isstringlike(name):
        raise TypeError("item name must be string-like")
    if label is not None and not isstringlike(label):
        raise TypeError("item label must be string-like")
    if id is not None and not isstringlike(id):
        raise TypeError("item id must be string-like")

    compat = self._form.backwards_compat
    matches = []  # order is important
    for item in self.items:
        if exclude_disabled and item.disabled:
            continue
        if name is not None and item.name != name:
            continue
        if label is not None:
            # Backwards-compatible mode wants an exact label match;
            # otherwise a substring match is enough.
            label_matched = False
            for candidate in item.get_labels():
                if compat:
                    hit = candidate.text == label
                else:
                    hit = candidate.text.find(label) > -1
                if hit:
                    label_matched = True
                    break
            if not label_matched:
                continue
        if id is not None and item.id != id:
            continue
        matches.append(item)
    return matches
1839
def get(self, name=None, label=None, id=None, nr=None,
        exclude_disabled=False):
    """Return item by name or label, disambiguating if necessary with nr.

    All arguments must be passed by name, with the exception of 'name',
    which may be used as a positional argument.

    If name is specified, then the item must have the indicated name.

    If label is specified, then the item must have a label whose
    whitespace-compressed, stripped, text substring-matches the indicated
    label string (eg. label="please choose" will match
    " Do please choose an item ").

    If id is specified, then the item must have the indicated id.

    nr is an optional 0-based index of the items matching the query.

    If nr is the default None value and more than one item is found, raises
    AmbiguityError (unless the HTMLForm instance's backwards_compat
    attribute is true).

    If no item is found, or if items are found but nr is specified and not
    found, raises ItemNotFoundError.

    Optionally excludes disabled items.

    """
    # In backwards-compatible mode an unspecified nr means "first match".
    if nr is None and self._form.backwards_compat:
        nr = 0  # :-/
    candidates = self.get_items(name, label, id, exclude_disabled)
    return disambiguate(candidates, nr, name=name, label=label, id=id)
1872
1873 def _get(self, name, by_label=False, nr=None, exclude_disabled=False):
1874 # strictly for use by deprecated methods
1875 if by_label:
1876 name, label = None, name
1877 else:
1878 name, label = name, None
1879 return self.get(name, label, nr, exclude_disabled)
1880
def toggle(self, name, by_label=False, nr=None):
    """Deprecated: given a name or label and optional disambiguating index
    nr, toggle the matching item's selection.

    Selecting items follows the behavior described in the docstring of the
    'get' method.

    if the item is disabled, or this control is disabled or readonly,
    raise AttributeError.

    """
    deprecation(
        "item = control.get(...); item.selected = not item.selected")
    item = self._get(name, by_label, nr)
    flipped = not item.selected
    self._set_selected_state(item, flipped)
1896
def set(self, selected, name, by_label=False, nr=None):
    """Deprecated: given a name or label and optional disambiguating index
    nr, set the matching item's selection to the bool value of selected.

    Selecting items follows the behavior described in the docstring of the
    'get' method.

    if the item is disabled, or this control is disabled or readonly,
    raise AttributeError.

    """
    deprecation(
        "control.get(...).selected = <boolean>")
    item = self._get(name, by_label, nr)
    self._set_selected_state(item, selected)
1911
1912 def _set_selected_state(self, item, action):
1913 # action:
1914 # bool False: off
1915 # bool True: on
1916 if self.disabled:
1917 raise AttributeError("control '%s' is disabled" % self.name)
1918 if self.readonly:
1919 raise AttributeError("control '%s' is readonly" % self.name)
1920 action == bool(action)
1921 compat = self._form.backwards_compat
1922 if not compat and item.disabled:
1923 raise AttributeError("item is disabled")
1924 else:
1925 if compat and item.disabled and action:
1926 raise AttributeError("item is disabled")
1927 if self.multiple:
1928 item.__dict__["_selected"] = action
1929 else:
1930 if not action:
1931 item.__dict__["_selected"] = False
1932 else:
1933 for o in self.items:
1934 o.__dict__["_selected"] = False
1935 item.__dict__["_selected"] = True
1936
def toggle_single(self, by_label=None):
    """Deprecated: flip the selection of this control's only item.

    Raises ItemCountError unless the control contains exactly one item.

    The by_label argument is ignored; it is accepted only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = not control.items[0].selected")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    only = self.items[0]
    flipped = not only.selected
    self._set_selected_state(only, flipped)
1953
def set_single(self, selected, by_label=None):
    """Deprecated: set the selection of this control's only item.

    Raises ItemCountError unless the control contains exactly one item.

    The by_label argument is ignored; it is accepted only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = <boolean>")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    only = self.items[0]
    self._set_selected_state(only, selected)
1969
def get_item_disabled(self, name, by_label=False, nr=None):
    """Deprecated: return the disabled state of a named list item."""
    deprecation(
        "control.get(...).disabled")
    item = self._get(name, by_label, nr)
    return item.disabled
1975
def set_item_disabled(self, disabled, name, by_label=False, nr=None):
    """Deprecated: set the disabled state of a named list item.

    disabled: boolean disabled state

    """
    deprecation(
        "control.get(...).disabled = <boolean>")
    item = self._get(name, by_label, nr)
    item.disabled = disabled
1985
def set_all_items_disabled(self, disabled):
    """Give every list item in this ListControl the same disabled state.

    disabled: boolean disabled state

    """
    for item in self.items:
        item.disabled = disabled
1994
def get_item_attrs(self, name, by_label=False, nr=None):
    """Deprecated: return the HTML attributes of one ListControl item.

    List items are described by OPTION elements for SELECT controls and
    by INPUT elements for the rest.  Those elements carry HTML attributes
    you may occasionally want to inspect -- for example, the "alt" HTML
    attribute gives a text string describing the item (graphical browsers
    usually display this as a tooltip).

    The returned dictionary maps HTML attribute names to values, both
    taken from the original HTML.

    """
    deprecation(
        "control.get(...).attrs")
    item = self._get(name, by_label, nr)
    return item.attrs
2011
def close_control(self):
    """Mark this control as closed.

    add_to_form checks this flag: a later control with the same name and
    type is added separately instead of being merged into a closed one.

    """
    self._closed = True
2014
def add_to_form(self, form):
    """Attach this control to form, merging with an earlier one if possible.

    A nameless control is always added as a separate control.  Otherwise
    the form's controls are searched from last to first for one with the
    same name and type.  If that control is still open, this control's
    items are merged into it; if it is closed, or no such control exists,
    this control is added via Control.add_to_form.

    """
    assert self._form is None or form == self._form, (
        "can't add control to more than one form")
    self._form = form
    if self.name is None:
        # always count nameless elements as separate controls
        Control.add_to_form(self, form)
        return
    position = len(form.controls)
    while position > 0:
        position -= 1
        earlier = form.controls[position]
        if earlier.name == self.name and earlier.type == self.type:
            if earlier._closed:
                Control.add_to_form(self, form)
            else:
                earlier.merge_control(self)
            return
    # nothing to merge with
    Control.add_to_form(self, form)
2033
def merge_control(self, control):
    """Append the items of control to this control's items.

    Both controls must agree on whether they are multiple-selection.

    """
    # usually, isinstance(control, self.__class__)
    assert bool(control.multiple) == bool(self.multiple)
    incoming = control.items
    self.items.extend(incoming)
2038
def fixup(self):
    """
    ListControls are built up from component list items (which are also
    ListControls) during parsing.  This method should be called after all
    items have been added.  See ListControl.__doc__ for the reason this is
    required.

    """
    # Subclasses use this hook to pick a default selection when the HTML
    # did not mark any item as selected.  Background:
    #
    # CHECKBOX:
    #  Nothing should be selected.
    # SELECT/single, SELECT/multiple and RADIO:
    #  RFC 1866 (HTML 2.0) says the first item should be selected.
    #  The W3C HTML 4.01 Specification says that client behaviour is
    #  undefined in this case.  For RADIO, exactly one must be selected,
    #  though which one is undefined.
    #  Both Netscape and Microsoft Internet Explorer (IE) choose the first
    #  item for SELECT/single.  However, both IE5 and Mozilla (both 1.0
    #  and Firebird 0.6) leave all items unselected for RADIO and
    #  SELECT/multiple.
    #
    # Since Netscape and IE both choose the first item for SELECT/single,
    # we do the same.  OTOH, both Netscape and IE leave SELECT/multiple
    # with nothing selected, in violation of RFC 1866 (but not in
    # violation of the W3C HTML 4 standard); the same is true of RADIO
    # (which *is* in violation of the HTML 4 standard).  We follow
    # RFC 1866 if the _select_default attribute is set, and Netscape and
    # IE otherwise.  RFC 1866 and HTML 4 are always violated insofar as
    # you can deselect all items in a RadioControl.

    # Merging has finished, so every item (including those merged in from
    # other controls) must now point back at this control.
    for item in self.items:
        item.__dict__["_control"] = self
2074
2075 def __getattr__(self, name):
2076 if name == "value":
2077 compat = self._form.backwards_compat
2078 if self.name is None:
2079 return []
2080 return [o.name for o in self.items if o.selected and
2081 (not o.disabled or compat)]
2082 else:
2083 raise AttributeError("%s instance has no attribute '%s'" %
2084 (self.__class__.__name__, name))
2085
2086 def __setattr__(self, name, value):
2087 if name == "value":
2088 if self.disabled:
2089 raise AttributeError("control '%s' is disabled" % self.name)
2090 if self.readonly:
2091 raise AttributeError("control '%s' is readonly" % self.name)
2092 self._set_value(value)
2093 elif name in ("name", "type", "multiple"):
2094 raise AttributeError("%s attribute is readonly" % name)
2095 else:
2096 self.__dict__[name] = value
2097
def _set_value(self, value):
    """Set the whole value of the control from a sequence of item names.

    value must be a sequence, not None or a string (TypeError otherwise).
    An empty sequence deselects every item, leaving disabled items alone
    unless the form's backwards_compat flag is true.  A single-selection
    control accepts at most one name, else ItemCountError is raised.

    """
    if value is None or isstringlike(value):
        raise TypeError("ListControl, must set a sequence")
    if value:
        if self.multiple:
            self._multiple_set_value(value)
        elif len(value) > 1:
            raise ItemCountError(
                "single selection list, must set sequence of "
                "length 0 or 1")
        else:
            self._single_set_value(value)
        return
    # empty sequence: clear the selection
    compat = self._form.backwards_compat
    for item in self.items:
        if not item.disabled or compat:
            item.selected = False
2114
2115 def _get_items(self, name, target=1):
2116 all_items = self.get_items(name)
2117 items = [o for o in all_items if not o.disabled]
2118 if len(items) < target:
2119 if len(all_items) < target:
2120 raise ItemNotFoundError(
2121 "insufficient items with name %r" % name)
2122 else:
2123 raise AttributeError(
2124 "insufficient non-disabled items with name %s" % name)
2125 on = []
2126 off = []
2127 for o in items:
2128 if o.selected:
2129 on.append(o)
2130 else:
2131 off.append(o)
2132 return on, off
2133
2134 def _single_set_value(self, value):
2135 assert len(value) == 1
2136 on, off = self._get_items(value[0])
2137 assert len(on) <= 1
2138 if not on:
2139 off[0].selected = True
2140
2141 def _multiple_set_value(self, value):
2142 compat = self._form.backwards_compat
2143 turn_on = [] # transactional-ish
2144 turn_off = [item for item in self.items if
2145 item.selected and (not item.disabled or compat)]
2146 names = {}
2147 for nn in value:
2148 if nn in names.keys():
2149 names[nn] += 1
2150 else:
2151 names[nn] = 1
2152 for name, count in names.items():
2153 on, off = self._get_items(name, count)
2154 for i in range(count):
2155 if on:
2156 item = on[0]
2157 del on[0]
2158 del turn_off[turn_off.index(item)]
2159 else:
2160 item = off[0]
2161 del off[0]
2162 turn_on.append(item)
2163 for item in turn_off:
2164 item.selected = False
2165 for item in turn_on:
2166 item.selected = True
2167
def set_value_by_label(self, value):
    """Set the value of control by item labels.

    value is expected to be an iterable of strings that are substrings of
    the item labels that should be selected.  Before substring matching is
    performed, the original label text is whitespace-compressed
    (consecutive whitespace characters are converted to a single space
    character) and leading and trailing whitespace is stripped.  Ambiguous
    labels are accepted without complaint if the form's backwards_compat is
    True; otherwise, it will not complain as long as all ambiguous labels
    share the same item name (e.g. OPTION value).

    Raises TypeError if value is a string, ItemCountError if more than one
    label is given for a single-selection control, AmbiguityError for an
    ambiguous label as described above, and ItemNotFoundError if no
    unused item matches a label.

    """
    if isstringlike(value):
        raise TypeError(value)
    if not self.multiple and len(value) > 1:
        raise ItemCountError(
            "single selection list, must set sequence of "
            "length 0 or 1")
    chosen = []
    for label_text in value:
        matches = self.get_items(label=label_text)
        if len(matches) > 1:
            if self._form.backwards_compat:
                # OK, we'll guess :-(  Assume first available item.
                matches = matches[:1]
            else:
                # ambiguous labels are fine as long as item names (e.g.
                # OPTION values) are same
                first_name = matches[0].name
                if [m for m in matches[1:] if m.name != first_name]:
                    raise AmbiguityError(label_text)
        for candidate in matches:
            # For the multiple-item case, we could try to be smarter,
            # saving them up and trying to resolve, but that's too much.
            if self._form.backwards_compat or candidate not in chosen:
                chosen.append(candidate)
                break
        else:  # all of them are used
            raise ItemNotFoundError(label_text)
    # Every item that should be on is now known: turn everything off,
    # then turn the chosen items back on.
    self.value = []
    for item in chosen:
        item.selected = True
2213
def get_value_by_label(self):
    """Return the value of the control as given by normalized labels.

    For each selected item (disabled items are skipped unless the form's
    backwards_compat flag is true) the result holds the text of its first
    label with non-empty text, or None if it has no such label.

    """
    texts = []
    compat = self._form.backwards_compat
    for item in self.items:
        if (not item.disabled or compat) and item.selected:
            text = None
            for label in item.get_labels():
                if label.text:
                    text = label.text
                    break
            texts.append(text)
    return texts
2227
def possible_items(self, by_label=False):
    """Deprecated: return the names or labels of all possible items.

    With by_label true, each item contributes the text of its first label
    with non-empty text (None if it has none); otherwise its name.

    Includes disabled items, which may be misleading for some use cases.

    """
    deprecation(
        "[item.name for item in self.items]")
    if not by_label:
        return [item.name for item in self.items]
    texts = []
    for item in self.items:
        text = None
        for label in item.get_labels():
            if label.text:
                text = label.text
                break
        texts.append(text)
    return texts
2247
2248 def _totally_ordered_pairs(self):
2249 if self.disabled or self.name is None:
2250 return []
2251 else:
2252 return [(o._index, self.name, o.name) for o in self.items
2253 if o.selected and not o.disabled]
2254
2255 def __str__(self):
2256 name = self.name
2257 if name is None: name = "<None>"
2258
2259 display = [str(o) for o in self.items]
2260
2261 infos = []
2262 if self.disabled: infos.append("disabled")
2263 if self.readonly: infos.append("readonly")
2264 info = ", ".join(infos)
2265 if info: info = " (%s)" % info
2266
2267 return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
2268 name, ", ".join(display), info)
2269
2270
class RadioControl(ListControl):
    """
    Covers:

     INPUT/RADIO

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # a radio button without a value attribute is given the value "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        # only one radio button of a group may be selected
        self.__dict__["multiple"] = False
        item = Item(self, attrs, index)
        item.__dict__["_selected"] = attrs.has_key("checked")

    def fixup(self):
        """Settle the default selection once all items have been merged.

        If no enabled item is selected, the first enabled item is selected
        when _select_default is set.  If several are selected, only the
        last one stays selected.

        """
        ListControl.fixup(self)
        found = [o for o in self.items if o.selected and not o.disabled]
        if found:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for extra in found[:-1]:
                extra.selected = False
            return
        if not self._select_default:
            return
        for candidate in self.items:
            if not candidate.disabled:
                candidate.selected = True
                break

    def get_labels(self):
        """Return []: labels are not reported for the control as a whole."""
        return []
2303
class CheckboxControl(ListControl):
    """
    Covers:

     INPUT/CHECKBOX

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # a checkbox without a value attribute is given the value "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        # any number of checkboxes sharing a name may be checked at once
        self.__dict__["multiple"] = True
        item = Item(self, attrs, index)
        item.__dict__["_selected"] = attrs.has_key("checked")

    def get_labels(self):
        """Return []: labels are not reported for the control as a whole."""
        return []
2321
2322
class SelectControl(ListControl):
    """
    Covers:

     SELECT (and OPTION)


    OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.

    SELECT control values and labels are subject to some messy defaulting
    rules.  For example, if the HTML representation of the control is:

    <SELECT name=year>
      <OPTION value=0 label="2002">current year</OPTION>
      <OPTION value=1>2001</OPTION>
      <OPTION>2000</OPTION>
    </SELECT>

    The items, in order, have labels "2002", "2001" and "2000", whereas their
    names (the OPTION values) are "0", "1" and "2000" respectively.  Note that
    the value of the last OPTION in this example defaults to its contents, as
    specified by RFC 1866, as do the labels of the second and third OPTIONs.

    The OPTION labels are sometimes more meaningful than the OPTION values,
    which can make for more maintainable code.

    Additional read-only public attribute: attrs

    The attrs attribute is a dictionary of the original HTML attributes of the
    SELECT element.  Other ListControls do not have this attribute, because in
    other cases the control as a whole does not correspond to any single HTML
    element.  control.get(...).attrs may be used as usual to get at the HTML
    attributes of the HTML elements corresponding to individual list items (for
    SELECT controls, these are OPTION elements).

    Another special case is that the Item.attrs dictionaries have a special key
    "contents" which does not correspond to any real HTML attribute, but rather
    contains the contents of the OPTION element:

    <OPTION>this bit</OPTION>

    """
    # HTML attributes here are treated slightly differently from other list
    # controls:
    # -The SELECT HTML attributes dictionary is stuffed into the OPTION
    #  HTML attributes dictionary under the "__select" key.
    # -The content of each OPTION element is stored under the special
    #  "contents" key of the dictionary.
    # After all this, the dictionary is passed to the SelectControl constructor
    # as the attrs argument, as usual.  However:
    # -The first SelectControl constructed when building up a SELECT control
    #  has a constructor attrs argument containing only the __select key -- so
    #  this SelectControl represents an empty SELECT control.
    # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
    #  the __select dictionary containing the SELECT HTML-attributes.

    def __init__(self, type, name, attrs, select_default=False, index=None):
        """Build a SELECT control holding at most one OPTION item.

        attrs: the OPTION HTML attributes, plus the SELECT HTML attributes
         under the "__select" key and the OPTION contents under "contents"
         (see the comment above)

        """
        # fish out the SELECT HTML attributes from the OPTION HTML attributes
        # dictionary
        self.attrs = attrs["__select"].copy()
        self.__dict__["_label"] = _get_label(self.attrs)
        self.__dict__["id"] = self.attrs.get("id")
        self.__dict__["multiple"] = self.attrs.has_key("multiple")
        # the majority of the contents, label, and value dance already happened
        contents = attrs.get("contents")
        # work on a copy so the caller's dictionary keeps its "__select" key
        attrs = attrs.copy()
        del attrs["__select"]

        ListControl.__init__(self, type, name, self.attrs, select_default,
                             called_as_base_class=True, index=index)
        self.disabled = self.attrs.has_key("disabled")
        self.readonly = self.attrs.has_key("readonly")
        if attrs.has_key("value"):
            # otherwise it is a marker 'select started' token
            o = Item(self, attrs, index)
            o.__dict__["_selected"] = attrs.has_key("selected")
            # add 'label' label and contents label, if different.  If both are
            # provided, the 'label' label is used for display in HTML
            # 4.0-compliant browsers (and any lower spec? not sure) while the
            # contents are used for display in older or less-compliant
            # browsers.  We make label objects for both, if the values are
            # different.
            label = attrs.get("label")
            if label:
                o._labels.append(Label({"__text": label}))
                if contents and contents != label:
                    o._labels.append(Label({"__text": contents}))
            elif contents:
                o._labels.append(Label({"__text": contents}))

    def fixup(self):
        """Settle the default selection once all items have been merged.

        If no item is selected, the first non-disabled item is selected for
        single-selection controls, and for multiple-selection controls when
        _select_default is set.  If several items of a single-selection
        control are selected, only the last one stays selected.

        """
        ListControl.fixup(self)
        # Firefox doesn't exclude disabled items from those considered here
        # (i.e. from 'found', for both branches of the if below).  Note that
        # IE6 doesn't support the disabled attribute on OPTIONs at all.
        found = [o for o in self.items if o.selected]
        if not found:
            if not self.multiple or self._select_default:
                for o in self.items:
                    if not o.disabled:
                        # Temporarily lift the control's disabled flag
                        # around the assignment, then restore it on the
                        # control.  The restore used to be written to the
                        # item (o.disabled), which left a disabled SELECT
                        # enabled and marked its default item disabled.
                        was_disabled = self.disabled
                        self.disabled = False
                        try:
                            o.selected = True
                        finally:
                            self.disabled = was_disabled
                        break
        elif not self.multiple:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for o in found[:-1]:
                o.selected = False
2435
2436
2437#---------------------------------------------------
class SubmitControl(ScalarControl):
    """
    Covers:

     INPUT/SUBMIT
     BUTTON/SUBMIT

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
        # blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't seem
        # to define this.
        if self.value is None:
            self.value = ""
        # the readonly flag is only set after the default value is in place
        self.readonly = True

    def get_labels(self):
        """Return the labels of the control, led by its value (if any).

        A non-empty value is wrapped in a Label and placed before the
        labels reported by ScalarControl.get_labels.

        """
        res = []
        if self.value:
            res.append(Label({"__text": self.value}))
        res.extend(ScalarControl.get_labels(self))
        return res

    def is_of_kind(self, kind):
        """Return true only for the "clickable" kind."""
        return kind == "clickable"

    def _click(self, form, coord, return_type, request_class=urllib2.Request):
        """Return the result of submitting form by clicking this control.

        The control is marked as clicked (at coord) only while
        form._switch_click runs.  The mark is cleared again even if that
        call raises, so a failed click cannot leave this control looking
        clicked for later submissions.

        """
        self._clicked = coord
        try:
            return form._switch_click(return_type, request_class)
        finally:
            self._clicked = False

    def _totally_ordered_pairs(self):
        """Return this control's pairs only while it is being clicked."""
        if not self._clicked:
            return []
        return ScalarControl._totally_ordered_pairs(self)
2473
2474
2475#---------------------------------------------------
class ImageControl(SubmitControl):
    """
    Covers:

     INPUT/IMAGE

    Coordinates are specified using one of the HTMLForm.click* methods.

    """
    def __init__(self, type, name, attrs, index=None):
        SubmitControl.__init__(self, type, name, attrs, index)
        # SubmitControl.__init__ sets readonly; undo that for images
        self.readonly = False

    def _totally_ordered_pairs(self):
        """Return the click coordinates (and value, if any) as pairs.

        Nothing is returned unless the control is enabled, named and
        currently being clicked.  The x and y coordinates are submitted as
        "<name>.x" and "<name>.y", followed by the control's own value if
        it has one.

        """
        clicked = self._clicked
        if self.disabled or not clicked:
            return []
        name = self.name
        if name is None:
            return []
        x_pair = (self._index, "%s.x" % name, str(clicked[0]))
        y_pair = (self._index + 1, "%s.y" % name, str(clicked[1]))
        pairs = [x_pair, y_pair]
        value = self._value
        if value:
            pairs.append((self._index + 2, name, value))
        return pairs

    # unlike SubmitControl, don't add the control's value as a label
    get_labels = ScalarControl.get_labels
2505
# aliases, just to make str(control) and str(form) clearer
class PasswordControl(TextControl):
    """TextControl under another name (used for the "password" type)."""


class HiddenControl(TextControl):
    """TextControl under another name (used for the "hidden" type)."""


class TextareaControl(TextControl):
    """TextControl under another name (used for the "textarea" type)."""


class SubmitButtonControl(SubmitControl):
    """SubmitControl under another name (used for the "submitbutton" type)."""
2511
2512
def is_listcontrol(control):
    """Return whether control reports itself to be of the "list" kind."""
    return control.is_of_kind("list")
2514
2515
2516class HTMLForm:
2517 """Represents a single HTML <form> ... </form> element.
2518
2519 A form consists of a sequence of controls that usually have names, and
2520 which can take on various values. The values of the various types of
2521 controls represent variously: text, zero-or-one-of-many or many-of-many
2522 choices, and files to be uploaded. Some controls can be clicked on to
2523 submit the form, and clickable controls' values sometimes include the
2524 coordinates of the click.
2525
2526 Forms can be filled in with data to be returned to the server, and then
2527 submitted, using the click method to generate a request object suitable for
2528 passing to urllib2.urlopen (or the click_request_data or click_pairs
2529 methods if you're not using urllib2).
2530
2531 import ClientForm
2532 forms = ClientForm.ParseFile(html, base_uri)
2533 form = forms[0]
2534
2535 form["query"] = "Python"
2536 form.find_control("nr_results").get("lots").selected = True
2537
2538 response = urllib2.urlopen(form.click())
2539
2540 Usually, HTMLForm instances are not created directly. Instead, the
2541 ParseFile or ParseResponse factory functions are used. If you do construct
2542 HTMLForm objects yourself, however, note that an HTMLForm instance is only
2543 properly initialised after the fixup method has been called (ParseFile and
2544 ParseResponse do this for you). See ListControl.__doc__ for the reason
2545 this is required.
2546
2547 Indexing a form (form["control_name"]) returns the named Control's value
2548 attribute. Assignment to a form index (form["control_name"] = something)
2549 is equivalent to assignment to the named Control's value attribute. If you
2550 need to be more specific than just supplying the control's name, use the
2551 set_value and get_value methods.
2552
2553 ListControl values are lists of item names (specifically, the names of the
2554 items that are selected and not disabled, and hence are "successful" -- ie.
2555 cause data to be returned to the server). The list item's name is the
2556 value of the corresponding HTML element's "value" attribute.
2557
2558 Example:
2559
2560 <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
2561 <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
2562
2563 defines a CHECKBOX control with name "cheeses" which has two items, named
2564 "leicester" and "cheddar".
2565
2566 Another example:
2567
2568 <SELECT name="more_cheeses">
2569 <OPTION>1</OPTION>
2570 <OPTION value="2" label="CHEDDAR">cheddar</OPTION>
2571 </SELECT>
2572
2573 defines a SELECT control with name "more_cheeses" which has two items,
2574 named "1" and "2" (because the OPTION element's value HTML attribute
2575 defaults to the element contents -- see SelectControl.__doc__ for more on
2576 these defaulting rules).
2577
2578 To select, deselect or otherwise manipulate individual list items, use the
2579 HTMLForm.find_control() and ListControl.get() methods. To set the whole
2580 value, do as for any other control: use indexing or the set_/get_value
2581 methods.
2582
2583 Example:
2584
2585 # select *only* the item named "cheddar"
2586 form["cheeses"] = ["cheddar"]
2587 # select "cheddar", leave other items unaffected
2588 form.find_control("cheeses").get("cheddar").selected = True
2589
2590 Some controls (RADIO and SELECT without the multiple attribute) can only
2591 have zero or one items selected at a time. Some controls (CHECKBOX and
2592 SELECT with the multiple attribute) can have multiple items selected at a
2593 time. To set the whole value of a ListControl, assign a sequence to a form
2594 index:
2595
2596 form["cheeses"] = ["cheddar", "leicester"]
2597
2598 If the ListControl is not multiple-selection, the assigned list must be of
2599 length one.
2600
2601 To check if a control has an item, if an item is selected, or if an item is
2602 successful (selected and not disabled), respectively:
2603
2604 "cheddar" in [item.name for item in form.find_control("cheeses").items]
2605 "cheddar" in [item.name for item in form.find_control("cheeses").items if
2606 item.selected]
2607 "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
2608
2609 Note that some list items may be disabled (see below).
2610
2611 Note the following mistake:
2612
2613 form[control_name] = control_value
2614 assert form[control_name] == control_value # not necessarily true
2615
2616 The reason for this is that form[control_name] always gives the list items
2617 in the order they were listed in the HTML.
2618
2619 List items (hence list values, too) can be referred to in terms of list
2620 item labels rather than list item names using the appropriate label
2621 arguments. Note that each item may have several labels.
2622
2623 The question of default values of OPTION contents, labels and values is
2624 somewhat complicated: see SelectControl.__doc__ and
2625 ListControl.get_item_attrs.__doc__ if you think you need to know.
2626
2627 Controls can be disabled or readonly. In either case, the control's value
2628 cannot be changed until you clear those flags (see example below).
2629 Disabled is the state typically represented by browsers by 'greying out' a
2630 control. Disabled controls are not 'successful' -- they don't cause data
2631 to get returned to the server. Readonly controls usually appear in
2632 browsers as read-only text boxes. Readonly controls are successful. List
2633 items can also be disabled. Attempts to select or deselect disabled items
2634 fail with AttributeError.
2635
2636 If a lot of controls are readonly, it can be useful to do this:
2637
2638 form.set_all_readonly(False)
2639
2640 To clear a control's value attribute, so that it is not successful (until a
2641 value is subsequently set):
2642
2643 form.clear("cheeses")
2644
2645 More examples:
2646
2647 control = form.find_control("cheeses")
2648 control.disabled = False
2649 control.readonly = False
2650 control.get("gruyere").disabled = True
2651 control.items[0].selected = True
2652
2653 See the various Control classes for further documentation. Many methods
2654 take name, type, kind, id, label and nr arguments to specify the control to
2655 be operated on: see HTMLForm.find_control.__doc__.
2656
2657 ControlNotFoundError (subclass of ValueError) is raised if the specified
2658 control can't be found. This includes occasions where a non-ListControl
2659 is found, but the method (set, for example) requires a ListControl.
2660 ItemNotFoundError (subclass of ValueError) is raised if a list item can't
2661 be found. ItemCountError (subclass of ValueError) is raised if an attempt
2662 is made to select more than one item and the control doesn't allow that, or
2663 set/get_single are called and the control contains more than one item.
2664 AttributeError is raised if a control or item is readonly or disabled and
2665 an attempt is made to alter its value.
2666
2667 Security note: Remember that any passwords you store in HTMLForm instances
2668 will be saved to disk in the clear if you pickle them (directly or
2669 indirectly). The simplest solution to this is to avoid pickling HTMLForm
2670 objects. You could also pickle before filling in any password, or just set
2671 the password to "" before pickling.
2672
2673
2674 Public attributes:
2675
2676 action: full (absolute URI) form action
2677 method: "GET" or "POST"
2678 enctype: form transfer encoding MIME type
2679 name: name of form (None if no name was specified)
2680 attrs: dictionary mapping original HTML form attributes to their values
2681
2682 controls: list of Control instances; do not alter this list
2683 (instead, call form.new_control to make a Control and add it to the
2684 form, or control.add_to_form if you already have a Control instance)
2685
2686
2687
2688 Methods for form filling:
2689 -------------------------
2690
2691 Most of these methods have very similar arguments. See
2692 HTMLForm.find_control.__doc__ for details of the name, type, kind, label
2693 and nr arguments.
2694
2695 def find_control(self,
2696 name=None, type=None, kind=None, id=None, predicate=None,
2697 nr=None, label=None)
2698
2699 get_value(name=None, type=None, kind=None, id=None, nr=None,
2700 by_label=False, # by_label is deprecated
2701 label=None)
2702 set_value(value,
2703 name=None, type=None, kind=None, id=None, nr=None,
2704 by_label=False, # by_label is deprecated
2705 label=None)
2706
2707 clear_all()
2708 clear(name=None, type=None, kind=None, id=None, nr=None, label=None)
2709
2710 set_all_readonly(readonly)
2711
2712
2713 Method applying only to FileControls:
2714
2715 add_file(file_object,
2716 content_type="application/octet-stream", filename=None,
2717 name=None, id=None, nr=None, label=None)
2718
2719
2720 Methods applying only to clickable controls:
2721
2722 click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2723 click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
2724 label=None)
2725 click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2726
2727 """
2728
# Maps a lower-cased control type name to the Control class used to
# represent it; new_control looks types up here.
type2class = {
    # text entry
    "text": TextControl,
    "password": PasswordControl,
    "hidden": HiddenControl,
    "textarea": TextareaControl,

    "isindex": IsindexControl,

    # file upload
    "file": FileControl,

    # buttons that do not submit the form
    "button": IgnoreControl,
    "buttonbutton": IgnoreControl,
    "reset": IgnoreControl,
    "resetbutton": IgnoreControl,

    # clickable controls
    "submit": SubmitControl,
    "submitbutton": SubmitButtonControl,
    "image": ImageControl,

    # list controls
    "radio": RadioControl,
    "checkbox": CheckboxControl,
    "select": SelectControl,
    }
2752
2753#---------------------------------------------------
2754# Initialisation. Use ParseResponse / ParseFile instead.
2755
def __init__(self, action, method="GET",
             enctype="application/x-www-form-urlencoded",
             name=None, attrs=None,
             request_class=urllib2.Request,
             forms=None, labels=None, id_to_labels=None,
             backwards_compat=True):
    """
    In the usual case, use ParseResponse (or ParseFile) to create new
    HTMLForm objects.

    action: full (absolute URI) form action
    method: "GET" or "POST"
    enctype: form transfer encoding MIME type
    name: name of form
    attrs: dictionary mapping original HTML form attributes to their values
     (copied, so the caller's dictionary is never modified)

    """
    self.action = action
    self.method = method
    self.enctype = enctype
    self.name = name
    if attrs is None:
        self.attrs = {}
    else:
        self.attrs = attrs.copy()
    # must exist before backwards_compat is assigned below: __setattr__
    # walks self.controls when that attribute is set
    self.controls = []
    self._request_class = request_class

    # these attributes are used by zope.testbrowser
    self._forms = forms  # this is a semi-public API!
    self._labels = labels  # this is a semi-public API!
    self._id_to_labels = id_to_labels  # this is a semi-public API!

    self.backwards_compat = backwards_compat  # note __setattr__

    self._urlunparse = urlparse.urlunparse
    self._urlparse = urlparse.urlparse
2793
2794 def __getattr__(self, name):
2795 if name == "backwards_compat":
2796 return self._backwards_compat
2797 return getattr(HTMLForm, name)
2798
2799 def __setattr__(self, name, value):
2800 # yuck
2801 if name == "backwards_compat":
2802 name = "_backwards_compat"
2803 value = bool(value)
2804 for cc in self.controls:
2805 try:
2806 items = cc.items
2807 except AttributeError:
2808 continue
2809 else:
2810 for ii in items:
2811 for ll in ii.get_labels():
2812 ll._backwards_compat = value
2813 self.__dict__[name] = value
2814
def new_control(self, type, name, attrs,
                ignore_unknown=False, select_default=False, index=None):
    """Adds a new control to the form.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Note that controls representing lists of items are built up from
    controls holding only a single list item.  See ListControl.__doc__ for
    further information.

    type: type of control (see Control.__doc__ for a list)
    attrs: HTML attributes of control
    ignore_unknown: if true, use a dummy Control instance for controls of
     unknown type; otherwise, use a TextControl
    select_default: for RADIO and multiple-selection SELECT controls, pick
     the first item as the default if no 'selected' HTML attribute is
     present (this defaulting happens when the HTMLForm.fixup method is
     called)
    index: index of corresponding element in HTML (see
     MoreFormTests.test_interspersed_controls for motivation)

    """
    type = type.lower()
    klass = self.type2class.get(type)
    if klass is None:
        # unknown control type
        if ignore_unknown:
            klass = IgnoreControl
        else:
            klass = TextControl

    # the control gets its own copy of the attributes
    own_attrs = attrs.copy()
    if issubclass(klass, ListControl):
        control = klass(type, name, own_attrs, select_default, index)
    else:
        control = klass(type, name, own_attrs, index)

    if type == "select" and len(attrs) == 1:
        # A single-key attrs dictionary marks a select control that has no
        # OPTION attributes of its own: close the most recently added
        # SELECT so that later items are not merged into it.
        position = len(self.controls)
        while position > 0:
            position -= 1
            earlier = self.controls[position]
            if earlier.type == "select":
                earlier.close_control()
                break

    control.add_to_form(self)
    control._urlparse = self._urlparse
    control._urlunparse = self._urlunparse
2862
def fixup(self):
    """Normalise the form once every control has been added.

    ParseFile and ParseResponse normally call this.  Don't call it yourself
    unless you're building your own Control instances, and then call it
    only once, after the last control has been added.

    """
    for ctl in self.controls:
        ctl.fixup()
    # Assigning through the public name runs __setattr__ again, which
    # re-applies the current setting.
    current = self._backwards_compat
    self.backwards_compat = current
2876
2877#---------------------------------------------------
def __str__(self):
    """Return a multi-line summary of the form, wrapped in <...>.

    The first line holds the optional name, then method, action and
    enctype; one line per control follows.

    """
    if self.name:
        prefix = self.name + " "
    else:
        prefix = ""
    lines = ["%s%s %s %s" % (prefix, self.method, self.action, self.enctype)]
    lines.extend([" %s" % str(ctl) for ctl in self.controls])
    return "<%s>" % "\n".join(lines)
2886
2887#---------------------------------------------------
2888# Form-filling methods.
2889
def __getitem__(self, name):
    """Return the value of the control found by find_control(name)."""
    control = self.find_control(name)
    return control.value
def __contains__(self, name):
    """Return True if the form has a control called name, else False.

    find_control reports a missing control by raising ControlNotFoundError
    rather than by returning a false value, so that exception is turned
    into False here.  Several controls sharing the name (AmbiguityError)
    still means the name is present.

    """
    try:
        self.find_control(name)
    except ControlNotFoundError:
        return False
    except AmbiguityError:
        return True
    return True
def __setitem__(self, name, value):
    """Set the value of the control found by find_control(name).

    An AttributeError raised while assigning the control's value is
    re-raised as ValueError carrying the same message.

    """
    control = self.find_control(name)
    try:
        control.value = value
    except AttributeError, e:
        # callers of form[name] = value see ValueError, not AttributeError
        raise ValueError(str(e))
2900
def get_value(self,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Return value of control.

    The control is located with find_control.  If only the name argument
    is supplied, this is equivalent to

    form[name]

    by_label is deprecated (a deprecation notice is issued when it is
    true); use get_value_by_label instead.  NotImplementedError is raised
    if by_label is true and the control has no get_value_by_label method.

    """
    if by_label:
        deprecation("form.get_value_by_label(...)")
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    if not by_label:
        return control.value
    try:
        getter = control.get_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    return getter()
def set_value(self, value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Set value of control.

    The control is located with find_control.  If only name and value
    arguments are supplied, this is equivalent to

    form[name] = value

    by_label is deprecated (a deprecation notice is issued when it is
    true); use set_value_by_label instead.  NotImplementedError is raised
    if by_label is true and the control has no set_value_by_label method.

    """
    if by_label:
        # point users at the setter: the notice used to name the getter
        deprecation("form.set_value_by_label(...)")
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    if not by_label:
        control.value = value
        return
    try:
        setter = control.set_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    setter(value)
def get_value_by_label(
    self, name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Return the by-label value of the control located by find_control.

    All arguments should be passed by name.

    """
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    getter = control.get_value_by_label
    return getter()
2958
def set_value_by_label(
    self, value,
    name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Set the by-label value of the control located by find_control.

    All arguments should be passed by name.

    """
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    setter = control.set_value_by_label
    setter(value)
2969
def set_all_readonly(self, readonly):
    """Set the readonly attribute of every control to bool(readonly)."""
    flag = bool(readonly)
    for ctl in self.controls:
        ctl.readonly = flag
2973
def clear_all(self):
    """Call clear() on every control in the form, in order.

    See HTMLForm.clear.__doc__.

    """
    for ctl in self.controls:
        ctl.clear()
2982
def clear(self,
          name=None, type=None, kind=None, id=None, nr=None, label=None):
    """Clear the value attribute of a control.

    The control is located with find_control and its clear() method is
    called.  As a result, the affected control will not be successful
    until a value is subsequently set.  The original notes state that
    AttributeError is raised on readonly controls; that is decided by the
    control's own clear().

    """
    target = self.find_control(name, type, kind, id, label=label, nr=nr)
    target.clear()
2993
2994
2995#---------------------------------------------------
2996# Form-filling methods applying only to ListControls.
2997
def possible_items(self,  # deprecated
                   name=None, type=None, kind=None, id=None,
                   nr=None, by_label=False, label=None):
    """Return a list of all values that the specified control can take.

    Deprecated.  The list control is located with _find_list_control and
    its possible_items(by_label) result is returned.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    return list_control.possible_items(by_label)
3004
def set(self, selected, item_name,  # deprecated
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False, label=None):
    """Select / deselect named list item.

    Deprecated.  The list control is located with _find_list_control.

    selected: boolean selected state
    item_name: name of the list item, passed on to the control's set()

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set(selected, item_name, by_label)
def toggle(self, item_name,  # deprecated
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False, label=None):
    """Toggle selected state of named list item.

    Deprecated.  The list control is located with _find_list_control and
    its toggle(item_name, by_label) method does the work.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle(item_name, by_label)
3021
def set_single(self, selected,  # deprecated
               name=None, type=None, kind=None, id=None,
               nr=None, by_label=None, label=None):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is just a convenience method, so you don't need to know the item's
    name -- the item name in these single-item controls is usually
    something meaningless like "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.toggle("on")
    control.toggle_single()

    by_label is accepted for compatibility only: it is ignored and
    deprecated.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set_single(selected)
def toggle_single(self, name=None, type=None, kind=None, id=None,
                  nr=None, by_label=None, label=None):  # deprecated
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.  by_label is ignored
    and deprecated.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle_single()
3050
3051#---------------------------------------------------
3052# Form-filling method applying only to FileControls.
3053
def add_file(self, file_object, content_type=None, filename=None,
             name=None, id=None, nr=None, label=None):
    """Add a file to be uploaded.

    The control is located with find_control, restricted to type "file",
    and the arguments below are handed to its add_file method.

    file_object: file-like object (with read method) from which to read
     data to upload
    content_type: MIME content type of data to upload
    filename: filename to pass to server

    If filename is None, no filename is sent to the server.

    If content_type is None, the content type is guessed based on the
    filename and the data read from the file object.

    XXX
    At the moment, guessed content type is always application/octet-stream.
    Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
    plain text.

    Note the following useful HTML attributes of file upload controls (see
    HTML 4.01 spec, section 17):

    accept: comma-separated list of content types that the server will
     handle correctly; you can use this to filter out non-conforming files
    size: XXX IIRC, this is indicative of whether form wants multiple or
     single files
    maxlength: XXX hint of max content length in bytes?

    """
    file_control = self.find_control(name, "file", id=id, label=label, nr=nr)
    file_control.add_file(file_object, content_type, filename)
3085
3086#---------------------------------------------------
3087# Form submission methods, applying only to clickable controls.
3088
def click(self, name=None, type=None, id=None, nr=0, coord=(1, 1),
          request_class=urllib2.Request,
          label=None):
    """Return request that would result from clicking on a control.

    The request object is a urllib2.Request instance, which you can pass to
    urllib2.urlopen (or ClientCookie.urlopen).

    Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
    IMAGEs) can be clicked.

    Will click on the first clickable control, subject to the name, type
    and nr arguments (as for find_control).  If no name, type, id or number
    is specified and there are no clickable controls, a request will be
    returned for the form in its current, un-clicked, state.

    IndexError is raised if any of name, type, id or nr is specified but no
    matching control is found.  ValueError is raised if the HTMLForm has an
    enctype attribute that is not recognised.

    You can optionally specify a coordinate to click at, which only makes a
    difference if you clicked on an image.

    Note that the request_class argument is not used: the request class
    stored on the form (self._request_class) is always passed on.

    """
    request_factory = self._request_class
    return self._click(name, type, id, label, nr, coord, "request",
                       request_factory)
3115
def click_request_data(self,
                       name=None, type=None, id=None,
                       nr=0, coord=(1, 1),
                       request_class=urllib2.Request,
                       label=None):
    """As for click method, but return a tuple (url, data, headers).

    You can use this data to send a request to the server.  This is useful
    if you're using httplib or urllib rather than urllib2.  Otherwise, use
    the click method.

    # Untested.  Have to subclass to add headers, I think -- so use urllib2
    # instead!
    import urllib
    url, data, hdrs = form.click_request_data()
    r = urllib.urlopen(url, data)

    # Untested.  I don't know of any reason to use httplib -- you can get
    # just as much control with urllib2.
    import httplib, urlparse
    url, data, hdrs = form.click_request_data()
    tup = urlparse.urlparse(url)
    host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
    conn = httplib.HTTPConnection(host)
    if data:
        conn.request("POST", path, data, dict(hdrs))
    else:
        conn.request("GET", path, headers=dict(hdrs))
    r = conn.getresponse()

    Note that the request_class argument is not used: the request class
    stored on the form (self._request_class) is always passed on.

    """
    request_factory = self._request_class
    return self._click(name, type, id, label, nr, coord, "request_data",
                       request_factory)
3149
def click_pairs(self, name=None, type=None, id=None,
                nr=0, coord=(1, 1),
                label=None):
    """As for click_request_data, but returns a list of (key, value) pairs.

    You can use this list as an argument to ClientForm.urlencode.  This is
    usually only useful if you're using httplib or urllib rather than
    urllib2 or ClientCookie.  It may also be useful if you want to manually
    tweak the keys and/or values, but this should not be necessary.
    Otherwise, use the click method.

    Note that this method is only useful for forms of MIME type
    x-www-form-urlencoded.  In particular, it does not return the
    information required for file upload.  If you need file upload and are
    not using urllib2, use click_request_data.

    Also note that Python 2.0's urllib.urlencode is slightly broken: it
    only accepts a mapping, not a sequence of pairs, as an argument.  This
    messes up any ordering in the argument.  Use ClientForm.urlencode
    instead.

    """
    request_factory = self._request_class
    return self._click(name, type, id, label, nr, coord, "pairs",
                       request_factory)
3174
3175#---------------------------------------------------
3176
def find_control(self,
                 name=None, type=None, kind=None, id=None,
                 predicate=None, nr=None,
                 label=None):
    """Locate and return some specific control within the form.

    At least one of the name, type, kind, id, label, predicate and nr
    arguments must be supplied, otherwise ValueError is raised.  If no
    matching control is found, ControlNotFoundError is raised.

    If name is specified, then the control must have the indicated name.

    If type is specified then the control must have the specified type (in
    addition to the types possible for <input> HTML tags: "text",
    "password", "hidden", "submit", "image", "button", "radio", "checkbox",
    "file" we also have "reset", "buttonbutton", "submitbutton",
    "resetbutton", "textarea", "select" and "isindex").

    If kind is specified, then the control must fall into the specified
    group, each of which satisfies a particular interface.  The types are
    "text", "list", "multilist", "singlelist", "clickable" and "file".

    If id is specified, then the control must have the indicated id.

    If predicate is specified, then the control must match that function.
    The predicate function is passed the control as its single argument,
    and should return a boolean value indicating whether the control
    matched.

    nr, if supplied, is the sequence number of the control (where 0 is the
    first).  Note that control 0 is the first control matching all the
    other arguments (if supplied); it is not necessarily the first control
    in the form.  If no nr is supplied, AmbiguityError is raised if
    multiple controls match the other arguments (unless the
    .backwards_compat attribute is true).

    If label is specified, then the control must have this label.  Note
    that radio controls and checkboxes never have labels: their items do.

    """
    for selector in (name, type, kind, id, label, predicate, nr):
        if selector is not None:
            break
    else:
        # every selector was left at None
        raise ValueError(
            "at least one argument must be supplied to specify control")
    return self._find_control(name, type, kind, id, label, predicate, nr)
3223
3224#---------------------------------------------------
3225# Private methods.
3226
def _find_list_control(self,
                       name=None, type=None, kind=None, id=None,
                       label=None, nr=None):
    """Locate a control like find_control, with is_listcontrol as predicate.

    ValueError is raised if none of the arguments is supplied.

    """
    for selector in (name, type, kind, id, label, nr):
        if selector is not None:
            break
    else:
        # every selector was left at None
        raise ValueError(
            "at least one argument must be supplied to specify control")

    return self._find_control(name, type, kind, id, label,
                              is_listcontrol, nr)
3237
def _find_control(self, name, type, kind, id, label, predicate, nr):
    """Return the single control matching all the supplied criteria.

    name, type, kind, id and label must be string-like or None (name may
    also be the Missing marker, which matches controls whose name is
    None); predicate must be callable or None; nr must be None or >= 0.
    TypeError / ValueError is raised otherwise.

    With nr given, the nr-th matching control (counting from 0) is
    returned.  Without nr, exactly one control must match, unless
    self.backwards_compat is true, in which case the first match wins.

    Raises AmbiguityError if several controls match and nr was not given,
    and ControlNotFoundError if nothing matches.

    """
    if ((name is not None) and (name is not Missing) and
        not isstringlike(name)):
        raise TypeError("control name must be string-like")
    if (type is not None) and not isstringlike(type):
        raise TypeError("control type must be string-like")
    if (kind is not None) and not isstringlike(kind):
        raise TypeError("control kind must be string-like")
    if (id is not None) and not isstringlike(id):
        raise TypeError("control id must be string-like")
    if (label is not None) and not isstringlike(label):
        raise TypeError("control label must be string-like")
    if (predicate is not None) and not callable(predicate):
        raise TypeError("control predicate must be callable")
    if (nr is not None) and nr < 0:
        raise ValueError("control number must be a positive integer")

    orig_nr = nr
    found = None
    ambiguous = False
    if nr is None and self.backwards_compat:
        # old behaviour: silently take the first match
        nr = 0

    for control in self.controls:
        if ((name is not None and name != control.name) and
            (name is not Missing or control.name is not None)):
            continue
        if type is not None and type != control.type:
            continue
        if kind is not None and not control.is_of_kind(kind):
            continue
        if id is not None and id != control.id:
            continue
        if predicate and not predicate(control):
            continue
        if label:
            # substring match against any of the control's labels
            for l in control.get_labels():
                if l.text.find(label) > -1:
                    break
            else:
                continue
        if nr is not None:
            if nr == 0:
                return control  # early exit: unambiguous due to nr
            nr -= 1
            continue
        if found is not None:
            ambiguous = True
            break
        found = control

    if found is not None and not ambiguous:
        return found

    description = []
    if name is not None: description.append("name %s" % repr(name))
    if type is not None: description.append("type '%s'" % type)
    if kind is not None: description.append("kind '%s'" % kind)
    if id is not None: description.append("id '%s'" % id)
    if label is not None: description.append("label '%s'" % label)
    if predicate is not None:
        description.append("predicate %s" % predicate)
    # "is not None" so that an explicit nr=0 is reported too
    if orig_nr is not None: description.append("nr %d" % orig_nr)
    description = ", ".join(description)

    if ambiguous:
        raise AmbiguityError("more than one control matching " + description)
    raise ControlNotFoundError("no control matching " + description)
3308
def _click(self, name, type, id, label, nr, coord, return_type,
           request_class=urllib2.Request):
    """Click the matching clickable control and return the result.

    The control is located with _find_control, restricted to kind
    "clickable", and its own _click does the work.  return_type
    ("pairs", "request_data" or anything else for a request object) and
    request_class are passed through.

    If no control matches and none was explicitly requested (name, type,
    id and label all None, nr == 0), the form is submitted in its current
    un-clicked state.  Otherwise ControlNotFoundError propagates.

    """
    try:
        control = self._find_control(
            name, type, "clickable", id, label, None, nr)
    except ControlNotFoundError:
        # label counts as an explicit request as well: asking for a
        # labelled button that does not exist must not be silently ignored
        explicitly_requested = (
            (name is not None) or (type is not None) or (id is not None) or
            (label is not None) or (nr != 0))
        if explicitly_requested:
            raise
        # no clickable controls, but no control was explicitly requested,
        # so return state without clicking any control
        return self._switch_click(return_type, request_class)
    else:
        return control._click(self, coord, return_type, request_class)
3323
def _pairs(self):
    """Return sequence of (key, value) pairs suitable for urlencoding.

    The pairs come from _pairs_and_controls, in the same order, with the
    index and control index dropped.

    """
    result = []
    for entry in self._pairs_and_controls():
        index, key, value, control_index = entry
        result.append((key, value))
    return result
3327
3328
def _pairs_and_controls(self):
    """Return sequence of (index, key, value, control_index)
    of totally ordered pairs suitable for urlencoding.

    control_index is the index of the control in self.controls.

    The result is ordered by index alone.  Entries sharing an index keep
    the order in which the controls produced them; keys and values are
    never compared with each other.

    """
    decorated = []
    for control_index in range(len(self.controls)):
        control = self.controls[control_index]
        for ii, key, val in control._totally_ordered_pairs():
            # The running position is unique, so tuple comparison stops
            # there: this sorts by ONLY the index, and ties stay in
            # insertion order.
            decorated.append((ii, len(decorated), key, val, control_index))

    decorated.sort()

    return [(ii, key, val, control_index)
            for (ii, position, key, val, control_index) in decorated]
3345
def _request_data(self):
    """Return a tuple (url, data, headers).

    GET: the form pairs are urlencoded into the query part of the action
    URL; data is None and headers is empty.  Only enctype
    application/x-www-form-urlencoded is accepted.

    POST: the action's own query string is kept.  The body is either the
    urlencoded pairs or, for multipart/form-data, a MIME document written
    by each control's _write_mime_data.

    In both cases the fragment of the action URL is dropped.  ValueError
    is raised for any other method or enctype.

    """
    verb = self.method.upper()
    split_action = self._urlparse(self.action)
    rest, (query, frag) = split_action[:-2], split_action[-2:]

    if verb == "GET":
        if self.enctype != "application/x-www-form-urlencoded":
            raise ValueError(
                "unknown GET form encoding type '%s'" % self.enctype)
        uri = self._urlunparse(rest + (urlencode(self._pairs()), None))
        return uri, None, []

    if verb != "POST":
        raise ValueError("Unknown method '%s'" % verb)

    uri = self._urlunparse(rest + (query, None))
    if self.enctype == "application/x-www-form-urlencoded":
        return (uri, urlencode(self._pairs()),
                [("Content-Type", self.enctype)])
    if self.enctype != "multipart/form-data":
        raise ValueError(
            "unknown POST form encoding type '%s'" % self.enctype)

    body = StringIO()
    http_hdrs = []
    writer = MimeWriter(body, http_hdrs)
    writer.startmultipartbody("form-data", add_to_http_hdrs=True, prefix=0)
    for ii, key, val, control_index in self._pairs_and_controls():
        # each control writes its own MIME part
        self.controls[control_index]._write_mime_data(writer, key, val)
    writer.lastpart()
    return uri, body.getvalue(), http_hdrs
3381
def _switch_click(self, return_type, request_class=urllib2.Request):
    """Return the form's submission data in the shape named by return_type.

    This is called by HTMLForm and clickable Controls to hide switching
    on return_type:

    "pairs": the result of self._pairs()
    "request_data": the result of self._request_data()
    anything else: a request_class instance built from the request data,
     with every header added to it

    """
    if return_type == "pairs":
        return self._pairs()
    if return_type == "request_data":
        return self._request_data()

    req_data = self._request_data()
    request = request_class(req_data[0], req_data[1])
    for header_name, header_value in req_data[2]:
        add_header = request.add_header
        if header_name.lower() == "content-type":
            # Prefer add_unredirected_header for Content-Type; fall back
            # to add_header where the request class lacks it (pre-2.4 and
            # not using ClientCookie).
            add_header = getattr(
                request, "add_unredirected_header", add_header)
        add_header(header_name, header_value)
    return request
Note: See TracBrowser for help on using the repository browser.