Crawl Shuangse Qiu historical lottery data.
Crawl address: http://baidu.lecai.com/lottery/draw/list/50
Required package: BeautifulSoup (install the bs4 package).
The script serializes the crawled data to JSON and stores it in the file `data`.
# -*- coding: utf-8 -*-
"""Crawl Shuangse Qiu (double-color ball) historical draw data.

Crawl address: http://baidu.lecai.com/lottery/draw/list/50
Requires: BeautifulSoup (the ``bs4`` package).

Fetches one draw-list page per year (2003-2014), extracts every drawn
number tuple, and stores the combined list as JSON in the file ``data``.
"""
import json
import re
import urllib.request

from bs4 import BeautifulSoup

# Matches the CSS class of the ball <span> elements (e.g. "ball_1").
_BALL_CLASS = re.compile(r"^ball_")


def extract_draws(html):
    """Return a list of int tuples parsed from one draw-list page.

    Each table with id="draw_list" is scanned row by row; the spans whose
    class starts with "ball_" hold the drawn numbers for that row.
    """
    soup = BeautifulSoup(html, "html.parser")
    draws = []
    for table in soup.find_all(id="draw_list"):
        for row in table.find_all("tr"):
            balls = row.find_all("span", attrs={"class": _BALL_CLASS})
            numbers = tuple(int(ball.text) for ball in balls)
            if numbers:  # skip header/separator rows that carry no balls
                draws.append(numbers)
    return draws


def main():
    """Crawl every year's page and dump the collected draws to ``data``."""
    collected = []
    for year in range(2003, 2015):
        print(year)
        url = "http://baidu.lecai.com/lottery/draw/list/50?d=%s-01-01" % year
        # NOTE(review): no timeout or retry here — the site may be slow
        # or no longer available; add a timeout before relying on this.
        with urllib.request.urlopen(url) as conn:
            html = conn.read()
        draws = extract_draws(html)
        print("count: %s" % len(draws))
        collected.extend(draws)
    with open("data", "w") as fl:
        fl.write(json.dumps(collected))


if __name__ == "__main__":
    main()
This script sorts the crawled data from the `data` file and stores the result in `Ticket.txt`, where it can be analyzed; a `stop` breakpoint helper is imported so each run of the script can be paused for interactive inspection.
"""Sort the Shuangse Qiu draw data produced by the crawler.

Reads the JSON list of draws from the file ``data``, sorts the draws in
ascending (lexicographic tuple) order, and writes one comma-joined draw
per line to ``Ticket.txt`` for later analysis.
"""
import json

try:
    # Optional: call stop() to drop into IPython for interactive inspection.
    from IPython import embed as stop
except ImportError:
    # Fall back to the stdlib debugger when IPython is unavailable.
    from pdb import set_trace as stop


def format_draws(draws):
    """Return sorted, newline-terminated CSV lines for a list of draws.

    Each draw is an iterable of ints. The original Python 2
    ``li.sort(lambda x, y: cmp(x, y))`` is exactly the default
    lexicographic tuple sort, so plain ``sorted`` is equivalent.
    """
    rows = sorted(tuple(draw) for draw in draws)
    return ["%s\n" % ",".join(str(n) for n in row) for row in rows]


def main():
    """Load ``data``, sort the draws, and write ``Ticket.txt``."""
    with open("data") as fl:
        draws = json.loads(fl.read())
    with open("Ticket.txt", "w") as fl:
        fl.writelines(format_draws(draws))


if __name__ == "__main__":
    main()