First, crawl process

1. Request each page of the Maoyan TOP100 board at http://maoyan.com/board/4?offset=n, where the offset takes the values 0, 10, ..., 90.
2. Parse the returned HTML with a regular expression to pull out each movie's rank, poster URL, title, actors, release time, and score.
3. Write every record to result.txt as one JSON line.
4. Use a multiprocessing pool so the ten pages are fetched in parallel (a small sketch of this flow follows below).
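As a minimal sketch of steps 1 and 4 only (assuming the same URL layout as the full script in the next section), the pagination plus process-pool dispatch looks like this; the actual fetching, parsing, and saving are stubbed out with a print:

```python
# Minimal sketch of the pagination + process-pool dispatch (steps 1 and 4).
from multiprocessing import Pool


def crawl(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    print('would fetch', url)  # the full script requests, parses and saves this page


if __name__ == '__main__':
    with Pool() as pool:
        pool.map(crawl, [i * 10 for i in range(10)])  # offsets 0, 10, ..., 90
```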
Second, code demonstration
```python
# -*- coding: utf-8 -*-
# _author: alexcthon
# mail: [email protected]
# date: 2018/8/3
import json
import re

import requests
from multiprocessing import Pool  # process pool, used to fetch the pages in parallel
from requests.exceptions import RequestException


def get_one_page(url):
    """Request a board page and return its HTML, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract rank, poster, title, actors, release time and score from one page."""
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading 3-character actor label
            'time': item[4].strip()[5:],    # drop the leading 5-character release-time label
            'score': item[5] + item[6]      # integer part + fraction part
        }


def write_to_file(content):
    """Append one movie record to result.txt as a JSON line."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])  # offsets 0, 10, ..., 90
```
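To see what the regular expression actually captures, here is a self-contained check run against a hand-made HTML fragment that mimics one `<dd>` entry of the Maoyan board page. The fragment and its class names are an assumption for illustration only, not real page source:

```python
# Quick check of the regular expression on a made-up <dd> fragment
# (assumed to resemble Maoyan's board markup).
import re

sample = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="https://example.com/poster.jpg" alt="" class="board-img" />
  <p class="name"><a href="/films/1203">霸王别姬</a></p>
  <p class="star">主演：张国荣,张丰毅,巩俐</p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>
'''

pattern = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S)

for item in re.findall(pattern, sample):
    print({
        'index': item[0],               # '1'
        'image': item[1],               # poster URL
        'title': item[2],               # '霸王别姬'
        'actor': item[3].strip()[3:],   # the [3:] drops the '主演：' label
        'time': item[4].strip()[5:],    # the [5:] drops the '上映时间：' label
        'score': item[5] + item[6],     # '9.' + '6' -> '9.6'
    })
```

Each line that write_to_file appends to result.txt has exactly this dictionary shape, serialized with ensure_ascii=False so the Chinese titles stay readable.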