source: py-scraping/foo.py@ 156

Last change on this file since 156 was 106, checked in by Rick van der Zwet, 15 years ago

Initial commit...

File size: 1.2 KB
RevLine 
[106]1
2
"""Scratch script: parse a saved Facebook AJAX response and print its links.

Inputs, both read from the current working directory:

* ``mycookies.txt`` -- an LWP-format cookie jar.  It was originally produced
  by submitting the login form at https://login.facebook.com/login.php with
  mechanize and saving the resulting cookie jar with
  ``cj.save("mycookies.txt", ignore_discard=True, ignore_expires=True)``.
* ``json.foo`` -- a saved copy of the response body returned by
  http://www.facebook.com/ajax/intent.php?filter=lf&__a=1

Output: debug prints of the embedded HTML payload, a slice of the tidied
markup, and the ``<a>`` elements found in it.

SECURITY: never put account credentials in this file, not even in
commented-out code.  Earlier revisions did; any password that was committed
must be considered compromised and rotated.  If the login step has to be
re-run, read the credentials from the environment or prompt for them.
"""

import json
import pprint
import re
import sys
from xml.dom.minidom import parse, parseString

import mechanize
import tidy

COOKIE_FILE = "mycookies.txt"
RESPONSE_FILE = "json.foo"

# Number of leading characters of the saved response that are not part of the
# JSON document and must be skipped before parsing.
# NOTE(review): presumably this is Facebook's 9-character "for (;;);" guard
# prefix -- confirm against the contents of json.foo.
JSON_PREFIX_LENGTH = 9

# Load the previously saved session cookies, keeping session-only and expired
# cookies as well (ignore_discard / ignore_expires).
cj = mechanize.LWPCookieJar()
cj.load(COOKIE_FILE, ignore_discard=True, ignore_expires=True)

# The browser is prepared with the session cookies; the live request is
# currently not issued, the saved response in RESPONSE_FILE is used instead.
br = mechanize.Browser()
br.set_cookiejar(cj)

# Read the saved response; the context manager closes the handle even if the
# read fails.
with open(RESPONSE_FILE, 'r') as f:
    foo = f.read()
foo = foo[JSON_PREFIX_LENGTH:]
obj = json.loads(foo)

# The HTML fragment of interest sits under payload -> html.  It is encoded to
# a UTF-8 byte string before being handed to tidy.
s = obj["payload"]["html"]
s = s.encode("utf-8")
pprint.pprint(s)

# Clean the fragment up with tidy before giving it to minidom
# (presumably because the raw fragment is not well-formed XML -- TODO confirm).
s = str(tidy.parseString(s, indent=1, tidy_mark=0, doctype="omit"))

# Debug output: whitespace-separated tokens 14..19 of the tidied markup.
print(s.split()[14:20])

doc = parseString(s)
print(doc.getElementsByTagName("a"))
Note: See TracBrowser for help on using the repository browser.