source: py-scraping/mechanize/_util.py@ 173

Last change on this file since 173 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 8.6 KB
Line 
1"""Utility functions and date/time routines.
2
3 Copyright 2002-2006 John J Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it
6under the terms of the BSD or ZPL 2.1 licenses (see the file
7COPYING.txt included with the distribution).
8
9"""
10
11import re, time, warnings
12
13
class ExperimentalWarning(UserWarning):
    """Warning category used by experimental() in this module.

    It is a plain UserWarning subclass with no extra behaviour; it exists so
    that these warnings can be filtered separately from other warnings.
    """
16
def experimental(message):
    """Issue message as an ExperimentalWarning.

    stacklevel=3 makes the warning point two frames above this function,
    i.e. at the caller of whichever function called experimental().
    """
    warnings.warn(message, category=ExperimentalWarning, stacklevel=3)
def hide_experimental_warnings():
    """Install a warnings filter that ignores every ExperimentalWarning."""
    warnings.filterwarnings(action="ignore", category=ExperimentalWarning)
def reset_experimental_warnings():
    """Install a warnings filter giving ExperimentalWarning the "default" action."""
    warnings.filterwarnings(action="default", category=ExperimentalWarning)
23
def deprecation(message):
    """Issue message as a DeprecationWarning.

    stacklevel=3 makes the warning point two frames above this function,
    i.e. at the caller of whichever function called deprecation().
    """
    warnings.warn(message, category=DeprecationWarning, stacklevel=3)
def hide_deprecations():
    """Install a warnings filter that ignores every DeprecationWarning."""
    warnings.filterwarnings(action="ignore", category=DeprecationWarning)
def reset_deprecations():
    """Install a warnings filter giving DeprecationWarning the "default" action."""
    warnings.filterwarnings(action="default", category=DeprecationWarning)
30
31
def isstringlike(x):
    """Return True if x can be concatenated with a native string.

    This is a duck-typing check: it evaluates ``x + ""`` and reports whether
    that succeeded.  Any ordinary exception raised by the addition means
    "not string-like" and yields False.
    """
    try:
        x + ""
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit raised during the check are not silently swallowed.
        return False
    return True
36
37## def caller():
38## try:
39## raise SyntaxError
40## except:
41## import sys
42## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
43
44
45from calendar import timegm
46
47# Date/time conversion routines for formats used by the HTTP protocol.
48
EPOCH = 1970


def my_timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or None.

    tt is a sequence whose first six items are
    (year, month, mday, hour, min, sec); any further items are ignored.

    Returns the result of calendar.timegm() when every field passes the
    range checks below.  Returns None when a field is out of range, and
    also when timegm() itself rejects the tuple (for example a year larger
    than datetime.date supports), so callers only need to test for None.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    in_range = (
        year >= EPOCH
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24      # hour 24 is tolerated, as it always was here
        and 0 <= minute <= 59
        and 0 <= sec <= 61       # presumably leaves room for leap seconds
    )
    if not in_range:
        return None
    try:
        return timegm(tt)
    except (ValueError, OverflowError):
        # The time is outside the range timegm() can represent.
        return None
57
# Weekday abbreviations, indexed like struct_time.tm_wday (Monday is 0).
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Month abbreviations; index 0 is January.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of ``months`` used for case-insensitive month lookup.
months_lower = [name.lower() for name in months]
63
64
def time2isoz(t=None):
    """Format t, a time in seconds since the epoch, as an ISO-style string.

    When t is omitted (or None) the current time is used.

    The result is expressed in Universal Time (UTC, aka GMT) and looks like
    "YYYY-MM-DD hh:mm:ssZ", for example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    utc = time.gmtime(t)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        utc.tm_year, utc.tm_mon, utc.tm_mday,
        utc.tm_hour, utc.tm_min, utc.tm_sec)
81
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The time is expressed in GMT and the format of the returned string is
    like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday must be followed by a comma, as shown in the docstring;
    # the format string previously omitted it.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        days[wday], mday, months[mon - 1], year, hour, minute, sec)
97
98
# Timezone names that mean "zero offset from UTC".  Only the keys are used.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric offset: optional sign, one or two hour digits, optional colon,
# optional two minute digits (e.g. "-0800", "+01:00", "+1").
timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")


def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by tz.

    tz is either one of the names in UTC_ZONES (matched exactly, so the
    caller is expected to upper-case it first) or a numeric offset accepted
    by timezone_re.  A leading "-" makes the offset negative.

    Returns None when tz is neither a known UTC name nor a numeric offset.
    """
    # ``in`` rather than dict.has_key(), which does not exist on Python 3.
    if tz in UTC_ZONES:
        return 0
    m = timezone_re.search(tz)
    if m is None:
        return None
    sign, hours, minutes = m.groups()
    offset = 3600 * int(hours)
    if minutes:
        offset += 60 * int(minutes)
    if sign == "-":
        offset = -offset
    return offset
115
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch, or None.

    All arguments are the strings captured by the date regexps in this
    module (or None for parts that were absent):

    day, yr       -- day of month and year, as digit strings
    mon           -- month abbreviation (any case) or month number
    hr, min, sec  -- clock fields; None is treated as 0.  sec may carry a
                     fractional part, which is discarded.
    tz            -- timezone string, or None for UTC

    Returns None when the month or the timezone is not recognized, or when
    my_timegm() rejects the resulting date.
    """
    # Translate the month to a number; month numbers start with 1 (January).
    try:
        mon = months_lower.index(mon.lower()) + 1
    except ValueError:
        # Not a known abbreviation: maybe it is already a number.
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # Make sure the clock elements are defined.
    if hr is None:
        hr = 0
    if min is None:
        min = 0
    if sec is None:
        sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # iso_re captures optional fractional seconds ("29.5"); int() alone
    # would raise ValueError on those, so go through float() and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # Find the "obvious" year: pick the century that puts the
        # abbreviated year closest to the current year.
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0:
                yr = yr + 100
            else:
                yr = yr - 100

    # Convert the UTC time tuple to seconds since epoch (not yet
    # timezone-adjusted).
    t = my_timegm((yr, mon, day, hr, min, sec))
    if t is None:
        return None

    # Adjust using the timezone string to get absolute time since epoch.
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
168
169
# Exactly the fixed-width HTTP date format, e.g.
# "Wed, 09 Feb 1994 22:23:32 GMT".  Only the *shape* of the weekday and
# month names is checked here.
strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
                       r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or full), optional comma, whitespace.
wkday_re = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of text is unrecognized, the time is
    outside the representable range, or the timezone string is not
    recognized.  If the string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # Fast exit for a strictly conforming string.
    m = strict_re.search(text)
    if m:
        g = m.groups()
        try:
            mon = months_lower.index(g[1].lower()) + 1
        except ValueError:
            # strict_re accepts any month-shaped token (e.g. "Foo"); an
            # unknown month is an unrecognized format, not an error.
            return None
        # int() for every field so the result is an integer, as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return my_timegm(tt)

    # No, we need some messy parsing...

    # Clean up: drop leading whitespace and the (useless) weekday.
    text = text.lstrip()
    text = wkday_re.sub("", text, 1)

    # Loose regexp parse; the groups are day, mon, yr, hr, min, sec, tz.
    m = loose_http_re.search(text)
    if m is None:
        return None  # bad format
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
246
247
# Raw string: the pattern is full of regex escapes (\d, \s, \/, \.) that are
# invalid string escapes in a normal literal.
iso_re = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Returns None if text does not match any of these formats.

    """
    # Clean up.
    text = text.lstrip()

    # Loose regexp parse.
    m = iso_re.search(text)
    if m is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone (the last group) that is
    # ignored here: is this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
Note: See TracBrowser for help on using the repository browser.