Python: a small crawler for scraping search-result URLs

The script below requests a Baidu search results page, extracts the URL shown in each `<span class="g">` element, appends it to `result.txt`, and then follows the "next page" link until it reaches the last page. It is written for Python 2 with the original BeautifulSoup 3 package (`from BeautifulSoup import BeautifulSoup`).

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import urllib2
from BeautifulSoup import BeautifulSoup


def search(key):
    # Build the search request; the placeholder "key" in the URL is
    # replaced with the actual keyword.
    search_url = 'http://www.baidu.com/s?ie=UTF-8&wd=key'
    req = urllib2.urlopen(search_url.replace('key', key))
    # Counter used to track the page number.
    count = 1
    # Main loop: crawl the URLs on each page until the last page.
    while 1:
        print "\033[1;31mPage %s:\033[0m" % count
        html = req.read()
        soup = BeautifulSoup(html)
        f = open('result.txt', 'a')
        # Each result URL lives in a <span class="g">...</span> element.
        content = soup.findAll('span', attrs={'class': 'g'})
        # Parse each matched element.
        for i in content:
            # i is a tag object, so match against its text; the URL is
            # the text up to the first space.
            pat = re.compile(r'^(.+?) .*$')
            url = re.search(pat, i.text)
            # The pattern may fail to match, so check before writing.
            if url:
                f.write(url.group(1) + '\n')
                print url.group(1)
        f.close()
        # Get the "next page" link. Every page except the first and the
        # last has two class="n" anchors: the first is "previous page",
        # the second is "next page", so take the last element.
        pager = soup('a', {'href': True, 'class': 'n'})
        next_page = 'http://www.baidu.com' + pager[-1]['href']
        # The last page has only one class="n" anchor, and it is
        # "previous page". So stop when there is a single anchor and
        # this is not the first page; otherwise the loop would never
        # terminate.
        if count > 1 and len(pager) == 1:
            print "\033[1;31mComplete!\033[0m"
            break
        # Not the last page yet, so fetch the next one.
        else:
            req = urllib2.urlopen(next_page)
            count += 1


if __name__ == '__main__':
    key = "hello world!"
    search(key)
```
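Since `urllib2` and BeautifulSoup 3 only exist on Python 2, here is a minimal Python 3 sketch of the same idea using `urllib.request` and BeautifulSoup 4 (`bs4`). This is an illustration, not the author's code: Baidu's markup has changed over time, so the `span class="g"` selector and the `class="n"` pager anchors are assumptions carried over from the original script and may need updating.

```python
#!/usr/bin/env python3
# Hypothetical Python 3 port of the crawler above. The selectors
# ('span' class 'g', 'a' class 'n') are assumptions carried over from
# the original script and may not match Baidu's current markup.
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # pip install beautifulsoup4

BASE = 'http://www.baidu.com'


def search(key):
    # Percent-encode the keyword so spaces and non-ASCII characters
    # form a valid query string.
    url = BASE + '/s?ie=UTF-8&wd=' + urllib.parse.quote(key)
    count = 1
    while True:
        print('Page %s:' % count)
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'html.parser')
        with open('result.txt', 'a') as f:
            for span in soup.find_all('span', attrs={'class': 'g'}):
                # The URL is the span text up to the first space.
                m = re.search(r'^(.+?) ', span.get_text())
                if m:
                    f.write(m.group(1) + '\n')
                    print(m.group(1))
        pager = soup.find_all('a', attrs={'href': True, 'class': 'n'})
        # On the last page only the "previous page" anchor remains.
        if count > 1 and len(pager) == 1:
            print('Complete!')
            break
        url = BASE + pager[-1]['href']
        count += 1


if __name__ == '__main__':
    search('hello world!')
```

Using `urllib.parse.quote` also fixes a latent bug in the original: a keyword containing spaces or non-ASCII characters (such as "hello world!") would otherwise produce a malformed query string.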