# The code is as follows:
# -*- coding: utf-8 -*-
'''
Created on 2013-12-5
@author: Good-temper
'''
import time
import urllib2

import bs4
def getpage(urlstr):
    '''
    Fetch a URL and return the response body.

    urlstr: the address handed to urllib2.urlopen.

    Returns whatever the response object's read() yields. Any exception
    raised by urllib2.urlopen or read() propagates to the caller.
    '''
    response = urllib2.urlopen(urlstr)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails; try/finally is used
        # because Python 2 response objects are not context managers.
        response.close()
def getnextpageurl(currpagenum):
    '''
    Build the listing URL for page ``currpagenum + 1`` and probe it.

    currpagenum: number of the page already processed (0 before the first).

    Returns the URL when the downloaded page contains no
    <span class="next-disabled"> element, otherwise the empty string.

    NOTE(review): the page is downloaded here only for the probe, and a page
    that does contain the disabled "next" control (presumably the last page)
    is never returned, so callers never crawl it. Confirm this is intended.
    '''
    # URL pattern:
    # http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-<page number>-1-1-72-4137-33.html
    url = (u'http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-'
           + str(currpagenum + 1)
           + u'-1-1-72-4137-33.html')
    # Is there a next page?
    # An explicit parser keeps results independent of which optional parser
    # libraries happen to be installed.
    soup = bs4.BeautifulSoup(getpage(url), 'html.parser')
    disabled_next = soup.find_all('span', {'class': 'next-disabled'})
    if not disabled_next:
        return url
    return ""
def analyzelist():
    '''
    Walk the listing pages and collect the product links found on them.

    Pages are requested through getnextpageurl() until it returns the empty
    string. On every page, each <div class="p-name"> contributes the href of
    its first <a> descendant. The running page count is printed after each
    page.

    Returns a list of href values in the order they were encountered.
    '''
    pagenum = 0
    links = []
    url = getnextpageurl(pagenum)
    while url != '':
        soup = bs4.BeautifulSoup(getpage(url), 'html.parser')
        for elem in soup.find_all('div', {'class': 'p-name'}):
            # Search the element directly; re-parsing str(elem) is redundant.
            anchor = elem.find('a')
            href = anchor.get('href') if anchor is not None else None
            # Skip blocks with no link instead of crashing on them.
            if href:
                links.append(href)
        pagenum += 1
        print(pagenum)
        url = getnextpageurl(pagenum)
    return links
def analyzecontent(url):
    '''
    Placeholder for analysing a single page; not implemented yet.

    url: accepted but currently unused.

    Always returns the empty string.
    '''
    return ""
def writetofile(list, path):
    '''
    Append every element of ``list`` to the file at ``path``, one per line.

    list: iterable of strings (the name shadows the builtin but is kept so
          existing callers are unaffected).
    path: file to open in append mode; it is created if missing, even when
          ``list`` is empty.

    Returns None. Errors from open() or write() propagate to the caller.
    '''
    # The with-block guarantees the handle is closed even if a write fails.
    with open(path, 'a') as f:
        f.writelines(elem + '\n' for elem in list)
if __name__ == '__main__':
    # Script entry point: crawl the listing, report how many links were
    # found, then append them to the output file.
    links = analyzelist()
    print('Total crawl ' + str(len(links)) + ' article\n')
    writetofile(links, u'E:\\jd_phone_list.dat')