import pycurl, StringIO, json, time, re, sys, urllib2
from lxml import etree

# headers = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#     "Accept-Encoding": "gzip, deflate, sdch",
#     "Accept-Language": "zh-CN,zh;q=0.8",
#     "Cache-Control": "max-age=0",
#     "Connection": "keep-alive",
#     "Cookie": "Hm_lvt_fa633287999535c3e5f5a63e82308549=1462868485; Hm_lpvt_fa633287999535c3e5f5a63e82308549=1462868485; CNZZDATA5838747=cnzz_eid%3D1693591872-1459152412-http%253A%252F%252Fwww.1396app.com%252F%26ntime%3D1462865237",
#     "Host": "www.1396app.com",
#     "Upgrade-Insecure-Requests": "1",
#     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
# }

reload(sys)
sys.setdefaultencoding('utf-8')

def gethtml(url, headers=None):
    c = pycurl.Curl()                          # build a Curl object
    # c.setopt(pycurl.REFERER, 'http://qy.m.58.com/')  # set the Referer
    c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects automatically
    c.setopt(pycurl.MAXREDIRS, 5)              # cap the number of redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connection timeout (original value was garbled)
    c.setopt(pycurl.TIMEOUT, 120)              # download timeout (original value was garbled)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')  # handle gzip; some stupid sites return a gzip-compressed page even if you don't ask for gzip
    # c.setopt(c.PROXY, ip)                    # proxy
    c.fp = StringIO.StringIO()
    c.setopt(pycurl.URL, url)                  # the URL to fetch
    # c.setopt(pycurl.HTTPHEADER, headers)     # pass in the request headers
    c.setopt(c.WRITEFUNCTION, c.fp.write)      # callback that writes into the string buffer
    c.perform()
    code = c.getinfo(c.HTTP_CODE)              # status code
    html = c.fp.getvalue()                     # page source
    return html

write_key = open('key.txt', 'a+')
for list_url in range(0, 441):
    url = 'http://www.icaile.com/tag/gl-45-%s.html' % list_url
    for key in re.findall(r'title="(.*?)"', gethtml(url)):
        key = key.decode('utf-8', 'ignore')
        write_key.write(key + '\n')
        print key
It's not that the request only works with all the headers added, but my suggestion is still to add them. Also, extracting the title="" content with re.findall at the end is technically sloppy.
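For reference, pycurl takes headers as a list of "Name: value" strings rather than a dict, so the commented-out headers dict above would need converting before it can be passed to pycurl.HTTPHEADER. A minimal sketch (gethtml_with_headers is just an illustrative name):

def gethtml_with_headers(url, headers):
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    # HTTPHEADER expects a list like ["Host: www.1396app.com", ...], not a dict
    header_list = ['%s: %s' % (k, v) for k, v in headers.items()]
    c.setopt(pycurl.HTTPHEADER, header_list)
    buf = StringIO.StringIO()
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    c.perform()
    return buf.getvalue()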
Using XPath would be more accurate... noting it here just for the record:
url_range = etree.HTML(gethtml(url).decode('utf-8', 'ignore'))
dateil = url_range.xpath('/html/body/div[3]/div[3]/div[1]/div[1]/div/ul/li[1]/a')[0].text
print dateil
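That absolute path is brittle and only grabs one item. If the goal is every list title on the page, a relative XPath over the title attributes does the same job as the re.findall line; the '//a/@title' path here is my assumption about the markup, so adjust it to the actual list structure:

# Sketch: replace re.findall with an XPath query over all title attributes.
tree = etree.HTML(gethtml(url).decode('utf-8', 'ignore'))
for key in tree.xpath('//a/@title'):
    print key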
The code is rather rough, and it doesn't deduplicate the keywords.
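A minimal way to deduplicate would be to collect the keywords into a set before writing, e.g.:

# Sketch: track seen keys in a set so each keyword is written only once.
seen = set()
write_key = open('key.txt', 'a+')
for list_url in range(0, 441):
    url = 'http://www.icaile.com/tag/gl-45-%s.html' % list_url
    for key in re.findall(r'title="(.*?)"', gethtml(url)):
        key = key.decode('utf-8', 'ignore')
        if key not in seen:
            seen.add(key)
            write_key.write(key + '\n')
write_key.close()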
As for what use this is for SEO: I just think scraping the keywords is cool. Tweak it a bit and it can stand in for the Locomotive (火车头) collector; rewrite it multithreaded and it would be even more efficient (pycurl, by the way, works fine with multithreading). A sketch of that follows.
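A rough multithreaded sketch in the same Python 2 style (each worker calls gethtml(), which builds its own Curl object per request, since a pycurl handle must not be shared between threads; the thread count of 10 is arbitrary):

import threading, Queue

url_queue = Queue.Queue()
lock = threading.Lock()

def worker():
    # Pull page URLs off the queue until it is empty.
    while True:
        try:
            url = url_queue.get_nowait()
        except Queue.Empty:
            return
        page = gethtml(url)
        with lock:  # serialize writes to the shared output file
            for key in re.findall(r'title="(.*?)"', page):
                write_key.write(key.decode('utf-8', 'ignore') + '\n')
        url_queue.task_done()

for list_url in range(0, 441):
    url_queue.put('http://www.icaile.com/tag/gl-45-%s.html' % list_url)

threads = [threading.Thread(target=worker) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()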
That's it: extracting the article titles from all the lists on a web page.