[Python]
# coding=gbk
import os
import sys
import re
import time
import urllib2

def perror_and_exit(message, status=-1):
    sys.stderr.write(message + '\n')
    sys.exit(status)

def get_text_from_html_tag(html):
    # grab the text between the first ">" and the next "</",
    # then strip the ">" (first char) and "</" (last two chars)
    pattern_text = re.compile(r">.*?</", re.DOTALL | re.MULTILINE)
    return pattern_text.findall(html)[0][1:-2].strip()

def parse_alexa(url):
    url_alexa = "http://icp.alexa.cn/index.php?q=%s" % url
    print url_alexa

    # handle exceptions: retry with exponential backoff
    times = 0
    while times < 5000:  # cap the number of retries
        try:
            alexa = urllib2.urlopen(url_alexa).read()
            pattern_table = re.compile(r"<table.*?</table>", re.DOTALL | re.MULTILINE)
            match_table = pattern_table.search(alexa)
            if not match_table:
                raise Exception("no table in html")
            break
        except Exception:
            print "try %s times: sleep %s seconds" % (times, 2 ** times)
            times += 1
            time.sleep(2 ** times)
            continue

    table = match_table.group()
    pattern_tr = re.compile(r"<tr.*?</tr>", re.DOTALL | re.MULTILINE)
    match_tr = pattern_tr.findall(table)
    if len(match_tr) != 2:
        perror_and_exit("table format is incorrect")

    icp_tr = match_tr[1]
    pattern_td = re.compile(r"<td.*?</td>", re.DOTALL | re.MULTILINE)
    match_td = pattern_td.findall(icp_tr)
    # print match_td

    company_name = get_text_from_html_tag(match_td[1])
    company_properties = get_text_from_html_tag(match_td[2])
    company_icp = get_text_from_html_tag(match_td[3])
    company_icp = company_icp[company_icp.find(">") + 1:]
    company_website_name = get_text_from_html_tag(match_td[4])
    company_website_home_page = get_text_from_html_tag(match_td[5])
    company_website_home_page = company_website_home_page[company_website_home_page.rfind(">") + 1:]
    company_detail_url = get_text_from_html_tag(match_td[7])
    pattern_href = re.compile(r"href=\".*?\"", re.DOTALL | re.MULTILINE)
    match_href = pattern_href.findall(company_detail_url)
    if len(match_href) == 0:
        company_detail_url = ""
    else:
        company_detail_url = match_href[0][len("href=\""):-1]

    return [url, company_name, company_properties, company_icp,
            company_website_name, company_website_home_page, company_detail_url]

if __name__ == "__main__":
    fw = open("out.txt", "w")
    for url in sys.stdin:
        url = url.strip()  # drop the trailing newline before building the query
        fw.write("\t".join(parse_alexa(url)) + "\n")
        time.sleep(2)  # pause between requests to avoid getting the IP blocked
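The download step in parse_alexa retries with exponential backoff: each failure increments a counter and sleeps 2**times seconds before the next attempt. The same pattern can be factored into a small reusable helper; this is a minimal sketch (fetch_with_backoff and its max_tries/base parameters are illustrative names, not part of the original script):

[Python]
import time
import urllib2

def fetch_with_backoff(url, max_tries=5, base=2):
    """Fetch url, sleeping base**attempt seconds after each failure."""
    for attempt in range(max_tries):
        try:
            return urllib2.urlopen(url).read()
        except Exception:
            delay = base ** attempt
            print "try %s times: sleep %s seconds" % (attempt, delay)
            time.sleep(delay)
    return None  # caller must handle the case where every attempt failed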
Each fetch sleeps for 2 s to keep the IP address from being blocked. In practice, even with the pause, the IP still gets blocked eventually.
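One common mitigation is to randomize the delay so the requests look less mechanical than a fixed 2 s beat. A minimal sketch (the interval bounds are arbitrary guesses, not values from the original post):

[Python]
import time
import random

def polite_sleep(min_s=2, max_s=6):
    # sleep a random interval instead of a fixed 2 s between fetches
    time.sleep(random.uniform(min_s, max_s))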
Because this is structured crawling, the program stops working as soon as the website changes its page format.
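One way to soften that fragility is to gather every assumption about the page layout into a single validation step, so a format change produces a clear error instead of silently wrong output. A hedged sketch along those lines (validate_icp_table is a hypothetical helper; it reuses perror_and_exit from the script above, and the row/cell counts mirror what parse_alexa expects):

[Python]
import re

def validate_icp_table(table):
    """Check the layout assumptions parse_alexa relies on; exit loudly if broken."""
    rows = re.compile(r"<tr.*?</tr>", re.DOTALL | re.MULTILINE).findall(table)
    if len(rows) != 2:
        perror_and_exit("table format is incorrect: expected 2 rows, got %d" % len(rows))
    cells = re.compile(r"<td.*?</td>", re.DOTALL | re.MULTILINE).findall(rows[1])
    if len(cells) < 8:
        perror_and_exit("table format is incorrect: expected at least 8 cells, got %d" % len(cells))
    return rows[1]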