Based on Python 2.7, this crawler scrapes app data from the Baidu Mobile Assistant website (http://shouji.baidu.com/software/).

One, Crawler process flowchart
The crawler process flowchart is as follows: Start → analyze the URL address structure → get the app category page URLs → crawl the app detail page URLs → crawl the app detail page data → save the crawled data to a JSON file → End.

Two, Concrete steps

1. Parsing the URL address structure
After opening the Baidu Mobile Assistant site http://shouji.baidu.com/software/, you can see ten app categories: social communication, system tools, financial shopping, and so on. Crawling through these ten categories should cover all of the apps.
Click into any category, for example Social Communication, and the URL is http://shouji.baidu.com/software/503/. Analysis shows that the ten categories are numbered 501 to 510, so each category URL is http://shouji.baidu.com/software/ plus a number from 501 to 510.
Each category has 1 to 8 pages of apps. For example, the second page of the Social Communication category is http://shouji.baidu.com/software/503/list_2.html; the number after list_ ranges from 1 to 8.
Through the above analysis, the crawled URLs can be divided into the following three-part structure:

self.base_url = 'http://shouji.baidu.com/software/'
self.category_num = [501, 502, 503, 504, 505, 506, 507, 508, 509, 510]
self.page_num = [1, 2, 3, 4, 5, 6, 7, 8]
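Before writing the crawler proper, the splicing can be sanity-checked with a standalone sketch (the variable names here are illustrative and not part of the final class):

# Standalone sketch: verify that the three parts splice into valid page URLs
base_url = 'http://shouji.baidu.com/software/'
for category in [501, 503]:
    for page in [1, 2]:
        print base_url + str(category) + '/list_' + str(page) + '.html'
# prints, among others: http://shouji.baidu.com/software/503/list_2.html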
where category_num and page_num represent the category numbers and page numbers, respectively.

2. Get the URLs of all app category pages
Define a list categoryPageURL_list to store the spliced category page URLs:
def getAppCategoryPageURL(self):
    # URL list of all app category pages
    categoryPageURL_list = []
    for x in self.category_num:
        for y in self.page_num:
            categoryPageURL_list.append(self.base_url + str(x) + '/list_' + str(y) + '.html')
    return categoryPageURL_list
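With all 10 categories and 8 pages enabled, this method yields 10 × 8 = 80 category page URLs. A quick hedged check (assuming the AppSipder class assembled at the end of this article, with the full lists uncommented):

spider = AppSipder()
urls = spider.getAppCategoryPageURL()
print len(urls)    # 80 with all 10 categories and 8 pages enabled
print urls[0]      # http://shouji.baidu.com/software/501/list_1.html

Note that a category with fewer than 8 pages will produce some list_N URLs that do not exist, so a production crawler may want to catch urllib2.HTTPError around those requests.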
3. Crawl all app detail page URLs

Crawl the detail page URL of every app on each category page and store them in the appDetailPageURL_list list.
# Crawl all app detail page URLs
def getAppDetailPageURL(self):
    categoryPageURL_list = self.getAppCategoryPageURL()
    appDetailPageURL_list = []
    for url in categoryPageURL_list:
        # Build the request object
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        content = response.read().decode("unicode-escape")
        # The re module provides regular expression support; a pattern can be
        # understood as a matching template. re.S lets "." also match the newline "\n"
        pattern = re.compile('<div.*?app-box">.*?<a href="(.*?)".*?>', re.S)
        resultStr = re.findall(pattern, content)
        for result in resultStr:
            #print 'crawling ' + result
            appDetailPageURL = 'http://shouji.baidu.com/' + result
            appDetailPageURL_list.append(appDetailPageURL)
    return appDetailPageURL_list
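To see what the app-box regular expression extracts, here is a minimal hedged test against an invented HTML fragment (the real page markup may differ slightly):

# -*- coding: utf-8 -*-
import re

# Hypothetical fragment imitating one category page entry; not the real markup
html = '<div class="app-box"><a href="software/item?docid=123456">App</a></div>'
pattern = re.compile('<div.*?app-box">.*?<a href="(.*?)".*?>', re.S)
print re.findall(pattern, html)   # ['software/item?docid=123456']

The captured relative path is then prefixed with http://shouji.baidu.com/ to form the detail page URL.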
4. Crawl app detail page data

Crawl the content of each app detail page to extract the app's specific data.
# Crawl the required content from an app detail page
def getAppInfo(self, appURL):
    try:
        request = urllib2.Request(appURL)
        response = urllib2.urlopen(request)
    except urllib2.URLError, e:
        print "Get AppInfo failed:", e.reason
        return None
    content = response.read().decode("utf-8")
    # Create a dict to hold the result
    result = {}
    # App name
    pattern = re.compile('<span>(.*?)</span>')
    resultStr = re.search(pattern, content)
    if resultStr:
        result['Name'] = resultStr.group(1)
    # App size; the matched string needs further processing
    pattern = re.compile('<span class="size">(.*?)</span>')
    resultStr = re.search(pattern, content)
    if resultStr:
        result['Size'] = (((resultStr.group(1)).split(':'))[1]).strip()
    # Version
    pattern = re.compile('<span class="version">(.*?)</span>')
    resultStr = re.search(pattern, content)
    if resultStr:
        result['Version'] = (((resultStr.group(1)).split(':'))[1]).strip()
    # Download count
    pattern = re.compile('<span class="download-num">(.*?)</span>')
    resultStr = re.search(pattern, content)
    if resultStr:
        result['download-num'] = (((resultStr.group(1)).split(':'))[1]).strip()
    # Logo URL (the original regex was lost in formatting; this <img> pattern is a plausible reconstruction)
    pattern = re.compile('<div class="app-pic">.*?<img src="(.*?)".*?>', re.S)
    resultStr = re.search(pattern, content)
    if resultStr:
        result['app-pic'] = resultStr.group(1)
    # Download link
    pattern = re.compile('<div.*?area-download">.*?<a target="_blank.*?href="(.*?)".*?>', re.S)
    resultStr = re.search(pattern, content)
    if resultStr:
        result['app-href'] = resultStr.group(1)
    # Detail page URL
    result['page-url'] = appURL
    # App description
    pattern = re.compile('<p.*?content content_hover">(.*?)<span.*?>.*?</span></p>', re.S)
    resultStr = re.search(pattern, content)
    if resultStr:
        result['description'] = resultStr.group(1)
    else:
        pattern = re.compile('<div class=.*?brief-long">.*?<p.*?content">(.*?)</p>.*?</div>', re.S)
        resultStr = re.search(pattern, content)
        if resultStr:
            result['description'] = resultStr.group(1)
    # App screenshot (the original regex was lost in formatting; this <li><img> pattern is a plausible reconstruction)
    pattern = re.compile('<li><img src="(.*?)".*?></li>', re.S)
    resultStr = re.search(pattern, content)
    if resultStr:
        result['screen-shot'] = resultStr.group(1)
    #print result
    return result
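For reference, a successful getAppInfo call returns a dict shaped roughly as follows. Every value below is invented for illustration; real records may also omit fields, since each one is guarded by an if resultStr check:

# Hypothetical shape of one crawled record (all values invented)
result = {
    'Name': u'Example App',                  # hypothetical value
    'Size': u'15.2M',                        # hypothetical value
    'Version': u'6.3.0',                     # hypothetical value
    'download-num': u'983000',               # hypothetical value
    'app-pic': u'http://.../logo.png',       # hypothetical URL
    'app-href': u'http://.../app.apk',       # hypothetical URL
    'page-url': u'http://shouji.baidu.com/...',  # the detail page itself
    'description': u'...',
    'screen-shot': u'http://.../shot1.png'   # hypothetical URL
}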
5. Save the crawled data to a JSON file

This method saves all of the crawled data to the appData.json file.
# Save the data as JSON
def saveData(self, resultInfo):
    # Convert resultInfo to the JSON data format and save it
    encodedJson = json.dumps(resultInfo)
    with open('appData.json', 'w') as f:
        f.write(encodedJson)
    print 'finished.'
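One Python 2 caveat: json.dumps defaults to ensure_ascii=True, so any Chinese text in the results is written as \uXXXX escape sequences. If a human-readable file is preferred, a hedged alternative (my variant, not the author's code) can write UTF-8 directly:

# Alternative sketch: write readable UTF-8 JSON instead of \uXXXX escapes
import json
import codecs

def saveData(self, resultInfo):
    # ensure_ascii=False keeps non-ASCII text readable; indent aids inspection
    encodedJson = json.dumps(resultInfo, ensure_ascii=False, indent=2)
    with codecs.open('appData.json', 'w', 'utf-8') as f:
        f.write(encodedJson)
    print 'finished.'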
I encapsulated all of the methods and parameters into a spider class and added an entry method that starts the crawler. The overall code is as follows:
# -*- coding: utf-8 -*-
import urllib2
import re
import json

class AppSipder:
    def __init__(self):
        # URL pattern: http://shouji.baidu.com/software/502/list_2.html, divided into three parts
        self.base_url = 'http://shouji.baidu.com/software/'
        # Category numbers
        #self.category_num = [501, 502, 503, 504, 505, 506, 507, 508, 509, 510]
        self.category_num = [501]
        # Page numbers
        #self.page_num = [1, 2, 3, 4, 5, 6, 7, 8]
        self.page_num = [1]

    # Get the URLs of all app category pages
    def getAppCategoryPageURL(self):
        # URL list of all app category pages
        categoryPageURL_list = []
        for x in self.category_num:
            for y in self.page_num:
                categoryPageURL_list.append(self.base_url + str(x) + '/list_' + str(y) + '.html')
        return categoryPageURL_list

    # Crawl all app detail page URLs
    def getAppDetailPageURL(self):
        categoryPageURL_list = self.getAppCategoryPageURL()
        appDetailPageURL_list = []
        for url in categoryPageURL_list:
            # Build the request object
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            content = response.read().decode("unicode-escape")
            # The re module provides regular expression support; re.S lets "." also match "\n"
            pattern = re.compile('<div.*?app-box">.*?<a href="(.*?)".*?>', re.S)
            resultStr = re.findall(pattern, content)
            for result in resultStr:
                #print 'crawling ' + result
                appDetailPageURL = 'http://shouji.baidu.com/' + result
                appDetailPageURL_list.append(appDetailPageURL)
        return appDetailPageURL_list

    # Crawl the required content from an app detail page
    def getAppInfo(self, appURL):
        try:
            request = urllib2.Request(appURL)
            response = urllib2.urlopen(request)
        except urllib2.URLError, e:
            print "Get AppInfo failed:", e.reason
            return None
        content = response.read().decode("utf-8")
        # Create a dict to hold the result
        result = {}
        # App name
        pattern = re.compile('<span>(.*?)</span>')
        resultStr = re.search(pattern, content)
        if resultStr:
            result['Name'] = resultStr.group(1)
        # App size; the matched string needs further processing
        pattern = re.compile('<span class="size">(.*?)</span>')
        resultStr = re.search(pattern, content)
        if resultStr:
            result['Size'] = (((resultStr.group(1)).split(':'))[1]).strip()
        # Version
        pattern = re.compile('<span class="version">(.*?)</span>')
        resultStr = re.search(pattern, content)
        if resultStr:
            result['Version'] = (((resultStr.group(1)).split(':'))[1]).strip()
        # Download count
        pattern = re.compile('<span class="download-num">(.*?)</span>')
        resultStr = re.search(pattern, content)
        if resultStr:
            result['download-num'] = (((resultStr.group(1)).split(':'))[1]).strip()
        # Logo URL (plausible reconstruction; the original regex was lost in formatting)
        pattern = re.compile('<div class="app-pic">.*?<img src="(.*?)".*?>', re.S)
        resultStr = re.search(pattern, content)
        if resultStr:
            result['app-pic'] = resultStr.group(1)
        # Download link
        pattern = re.compile('<div.*?area-download">.*?<a target="_blank.*?href="(.*?)".*?>', re.S)
        resultStr = re.search(pattern, content)
        if resultStr:
            result['app-href'] = resultStr.group(1)
        # Detail page URL
        result['page-url'] = appURL
        # App description
        pattern = re.compile('<p.*?content content_hover">(.*?)<span.*?>.*?</span></p>', re.S)
        resultStr = re.search(pattern, content)
        if resultStr:
            result['description'] = resultStr.group(1)
        else:
            pattern = re.compile('<div class=.*?brief-long">.*?<p.*?content">(.*?)</p>.*?</div>', re.S)
            resultStr = re.search(pattern, content)
            if resultStr:
                result['description'] = resultStr.group(1)
        # App screenshot (plausible reconstruction; the original regex was lost in formatting)
        pattern = re.compile('<li><img src="(.*?)".*?></li>', re.S)
        resultStr = re.search(pattern, content)
        if resultStr:
            result['screen-shot'] = resultStr.group(1)
        #print result
        return result

    # Entry point of the crawler
    def startSpider(self):
        print 'Start crawling, please wait...'
        appDetailPageURL_list = self.getAppDetailPageURL()
        resultInfo = []
        for url in appDetailPageURL_list:
            resultInfo.append(self.getAppInfo(url))
        print len(resultInfo), 'apps have been crawled.'
        # Convert resultInfo to the JSON data format and save it
        encodedJson = json.dumps(resultInfo)
        with open('appData.json', 'w') as f:
            f.write(encodedJson)
        print 'finished.'

spider = AppSipder()
spider.startSpider()
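Two small robustness tweaks are worth considering (sketches under my own assumptions, not part of the original code): getAppInfo returns None when a request fails, so None entries can end up in appData.json, and requesting pages back-to-back with no pause is unfriendly to the server. A hedged variant of startSpider:

import time   # would go at the top of the file

    # Variant sketch: skip failed pages and pause between requests
    def startSpider(self):
        print 'Start crawling, please wait...'
        appDetailPageURL_list = self.getAppDetailPageURL()
        resultInfo = []
        for url in appDetailPageURL_list:
            info = self.getAppInfo(url)
            if info is not None:    # drop failed requests instead of saving None
                resultInfo.append(info)
            time.sleep(0.5)         # small delay to be polite to the server
        print len(resultInfo), 'apps have been crawled.'
        encodedJson = json.dumps(resultInfo)
        with open('appData.json', 'w') as f:
            f.write(encodedJson)
        print 'finished.'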