spider.py
1 #-*-coding:utf-8-*-2 fromUrllibImportUrlEncode3 ImportRequests4 fromRequests.exceptionsImportrequestexception5 ImportJSON6 ImportRe7 ImportOS8 fromHashlibImportMD59 fromBs4ImportBeautifulSoupTen ImportPymongo One fromMultiprocessingImportPool A fromJson.decoderImportJsondecoder - fromConfigImport* - theClient = Pymongo. Mongoclient (Mongo_url, connect=False) -db =client[mongo_db] - - defGet_page_index (Offset,keyword): +data = { - 'Offset': Offset, + 'format':'JSON', A 'keyword': keyword, at 'AutoLoad':'true', - 'Count':' -', - 'Cur_tab': 3 - } -URL ='http://www.toutiao.com/search_content/?'+urlencode (data) - Try: inResponse =requests.get (URL) - ifResponse.status_code = = 200: to returnResponse.text + returnNone - exceptrequestexception: the PrintU'failed to request index page', the URL * returnNone $ Panax Notoginseng defParse_page_index (HTML): -data =json.loads (HTML) the ifData and 'Data' inchData.keys (): + forIteminchData.get ('Data'): A yieldItem.get ('Article_url') the + defget_page_detail (URL): - Try: $Response =requests.get (URL) $ ifResponse.status_code = = 200: - returnResponse.text - returnNone the exceptrequestexception: - PrintU'Request Details page failed', the URLWuyi returnNone the - defparse_page_detail (HTML, URL): WuSoup = BeautifulSoup (HTML,'lxml') -title = Soup.select ('title') [0].get_text () About Print(title) $Images_pattern = Re.compile ('Gallery: (. *?), \ n', Re. 
S) -result =Re.search (Images_pattern, HTML) - ifResult: -data = Json.loads (Result.group (1)) A ifData and 'sub_images' inchData.keys (): +Sub_images = Data.get ('sub_images') theImages = [Item.get ('URL') forIteminchSub_images] - forImageinchimages:download_image (image) $ return { the 'title': Title, the 'URL': URL, the 'Images': Images the } - in defSave_to_mongo (Result): the ifDb[mongo_table].insert (Result): the PrintU'Store to MongoDB success', result About returnTrue the returnFalse the the defdownload_image (URL): + PrintU'is downloading', the URL - Try: theResponse =requests.get (URL)Bayi ifResponse.status_code = = 200: the save_image (response.content) the returnNone - exceptrequestexception: - PrintU'Request picture failed', the URL the returnNone the the defsave_image (content): theFile_path ='{0}/{1}. {2}'. Format (OS.GETCWD (), MD5 (content). Hexdigest (),'jpg') - if notos.path.exists (file_path): theWith open (File_path,'WB') as F: the f.write (content) the f.close ()94 the defMain (offset): theHTML =Get_page_index (offset, KEYWORD) the forUrlinchParse_page_index (HTML):98HTML =get_page_detail (URL) About ifHTML: -result =parse_page_detail (HTML, URL)101 ifResult:save_to_mongo (Result)102 103 if __name__=='__main__':104groups = [x*20 forXinchRange (Group_start, group_end+1)] thePool =Pool ()106Pool.map (Main, groups)
View Code
config.py
# -*- coding: utf-8 -*-
"""Settings read by spider.py."""

# MongoDB host, database name and collection name for the scraped records.
MONGO_URL = 'localhost'
MONGO_DB = 'Toutiao'
MONGO_TABLE = 'Toutiao'

# Inclusive range of result-page indices to crawl; spider.py multiplies each
# index by 20 to obtain the request offset.
GROUP_START = 0
GROUP_END = 20

# Search term sent to the index endpoint.
# NOTE(review): this literal looks machine-translated (the page title refers
# to street photography) - confirm the intended search keyword.
KEYWORD = 'Street Pat'
View Code
Analyzing Ajax requests to scrape street-photography galleries from Toutiao (Today's Headlines)