Python crawler with BeautifulSoup: crawling a book catalog (1)
As a first exercise with Python + BeautifulSoup, crawl some data and build a book catalog from it (the category page at category.dangdang.com).

Idea:
- 1. Get the Dangdang book categories: major category name + URL, and subcategory name + URL.

Effect: a printed list of major categories and subcategories with their links (see the sample run at the end of getbookkindinfo.py).
Script Description:
- 1. mylog.py: the logging helper
- 2. getbookkindinfo.py: the book catalogue crawler (a minimal sketch of the extraction idea follows this list)
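Before the full scripts, here is the extraction idea in isolation: parse the category page with BeautifulSoup, then read the name and href of each category link. This is only a minimal sketch; it assumes htmlContent already holds the downloaded page HTML and that the 'classify_books' / 'classify_kind' markup used in the full script still matches the live page.

# Minimal sketch of the extraction idea (assumes htmlContent holds the page HTML).
from bs4 import BeautifulSoup

soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='GBK')
for kind in soup.find('div', class_='classify_books', id='floor_1').find_all('div', class_='classify_kind'):
    big = kind.div.a                                        # major category link
    print('%s----%s' % (big.string, big.get('href')))
    for li in kind.find('ul').find_all('li'):               # subcategory links
        print('  %s----%s' % (li.a.string, li.a.get('href')))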
mylog.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
'''
import logging
import getpass
import sys


class MyLog(object):
    '''This class is used to create a personal log'''
    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # log file name
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter(
            '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s')

        # the log is shown on screen and also written to the log file
        logHand = logging.FileHandler(self.logFile)
        logHand.setFormatter(self.formatter)
        logHand.setLevel(logging.ERROR)   # only errors are recorded in the log file
        logHandSt = logging.StreamHandler()
        logHandSt.setFormatter(self.formatter)
        self.logger.addHandler(logHand)
        self.logger.addHandler(logHandSt)

    '''The 5 log levels correspond to the following 5 methods'''
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        self.logger.warn(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)


if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug("I am a debug")
    mylog.info("I am an info")
    mylog.warn("I am a warn")
    mylog.error("I am an error")
    mylog.critical("I am a critical")
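One detail worth noting: the log file name is built from sys.argv[0][0:-3], which simply drops the trailing '.py' from the running script's name, so every script that imports MyLog gets its own .log file next to it. A tiny illustration of that slice (the file name below is just an example value):

# sys.argv[0] is the script path; [0:-3] drops the trailing '.py'
script = 'getbookkindinfo.py'      # example value of sys.argv[0]
print(script[0:-3] + '.log')       # -> getbookkindinfo.log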
getbookkindinfo.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
Get the Dangdang book categories: major category name + URL, subcategory name + URL
'''
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog


class BookKindItem(object):
    '''One book category'''
    name = None   # category name
    url = None    # category URL


class GetBookKindItem(object):
    '''Get the Dangdang book categories'''
    def __init__(self):
        self.urls = []
        self.log = mylog()
        # self.getUrls()

    def getResponseContent(self, url):
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except:
            self.log.error(u'Python failed to get data for url: %s' % url)
        else:
            self.log.info(u'Python got data for url: %s successfully' % url)
            return response.read()

    def getUrls(self):
        url = r'http://category.dangdang.com/?ref=www-0-C'
        htmlContent = self.getResponseContent(url)
        # using UTF8 here does not return the complete book category list
        soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='GBK')
        dl = []   # major categories
        xl = []   # subcategories
        # outsideDiv -- the outer div; _li -- the li layer
        for outsideDiv in soup.find('div', class_='classify_books', id='floor_1').find_all('div', class_='classify_kind'):
            # major book category
            item_dl = BookKindItem()
            item_dl.name = outsideDiv.div.a.string
            item_dl.url = outsideDiv.div.a.get('href')
            dl.append(item_dl)
            # for e in dl:
            #     print('%s----%s' % (e.name, e.url))

            # book subcategories
            for _li in outsideDiv.find('ul').find_all('li'):
                if _li.a.string == 'more':   # skip the "more" link
                    continue
                else:
                    item_xl = BookKindItem()
                    item_xl.name = _li.a.string
                    item_xl.url = _li.a.get('href')
                    xl.append(item_xl)
            # for e in xl:
            #     print('%s----%s' % (e.name, e.url))
        return dl, xl


if __name__ == '__main__':
    # url = u'http://tieba.baidu.com/f?kw=%E6%9D%83%E5%8A%9B%E7%9A%84%E6%B8%B8%E6%88%8F&ie=utf-8&pn=50'
    # gti = GetBookKindItem()
    # first get the related links from KindLinks
    kls = GetBookKindItem()
    # the book category link data
    bData = kls.getUrls()
    print('## Book major categories')
    for e in bData[0]:
        print('%s----%s' % (e.name, e.url))
    print('## Book subcategories')
    for e in bData[1]:
        print('%s----%s' % (e.name, e.url))
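The scripts above target Python 2 (urllib2). If you would rather run the fetch step under Python 3, a roughly equivalent getResponseContent could use urllib.request instead; this is only a sketch under that assumption, not part of the original code:

# Python 3 sketch of getResponseContent: urllib2 was split into
# urllib.request / urllib.error in Python 3.
import urllib.request
import urllib.error

def get_response_content(url):
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as e:
        print('failed to fetch %s: %s' % (url, e))
        return None
    # returns bytes; BeautifulSoup can still decode them with from_encoding='GBK'
    return response.read()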