Web of Science Crawler in Practice (POST method)
one. Overview
This crawler retrieves a paper by its title and then scrapes the paper's citation count, its downloads over the last 180 days, and its total downloads. The data source is the Web of Science Core Collection, and the crawl is done with the POST method of the Python requests library; version 2.0 additionally parallelizes the crawl with multiple processes to speed it up.
two. Website and crawl strategy analysis
First open the link http://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=5bkzch5qilqiopn9wjv&preferencesSaved=
to reach the search page, then press F12 to open the developer tools and switch to the Network tab, as shown in the following figure:
Figure 1. Search page
Then select Web of Science Core Collection in the red box on the right and choose the Title field. Enter a title; here I use "Black carbon in soils from different land use areas of Shanghai, China: level, sources and relationship with polycyclic aromatic hydrocarbons" as the example. The contents of the red box in the figure below are the data that must be submitted with the POST method.
Figure 2. POST form data
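Before diving into the full script, here is a minimal sketch of just this POST step, reduced to the fields that actually identify the query. The field names are taken from the captured form above; the SID value is a hypothetical placeholder, since a real session ID must be parsed from the redirect URL of the Web of Science home page (the full code below does that):

```python
# Minimal sketch of the search POST. The SID below is a placeholder;
# a real session ID must be harvested from the site first.
import requests

search_url = 'https://apps.webofknowledge.com/WOS_GeneralSearch.do'
form_data = {
    'action': 'search',
    'product': 'WOS',
    'search_mode': 'GeneralSearch',
    'SID': 'YOUR_SESSION_ID',   # placeholder session token
    'fieldCount': 1,
    'formUpdated': 'true',
    'value(input1)': 'Black carbon in soils from different land use areas of Shanghai, China',
    'value(select1)': 'TI',     # TI = search the Title field
}
r = requests.post(search_url, data=form_data)
print(r.status_code, r.url)
```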
Click Search to enter the page we need to crawl, as follows:
Figure 3. Page to crawl
The citation and download counts we need are marked with red boxes on the right side of the page. Hover over one of them, open the developer tools, and inspect the element's class so we can analyze the surrounding HTML.
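The parsing that this inspection leads to can be sketched on its own. Assuming a result page has already been saved to a local file (result_page.html is hypothetical here), the two class names found above are enough for lxml to pull out the citation and usage counts:

```python
# Minimal sketch of the extraction step, run against a saved result page.
from lxml import etree

with open('result_page.html', encoding='utf-8') as f:  # hypothetical saved page
    tree = etree.HTML(f.read())

# citation count: <div class="search-results-data-cite"><a>N</a>...</div>
cited = tree.xpath("//div[@class='search-results-data-cite']/a/text()")
# usage counts (last 180 days and total): <div class="alum_text"><span>N</span></div>
download = tree.xpath(".//div[@class='alum_text']/span/text()")
print(cited, download)
```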
three. Crawler Code
The complete Python code is below; it is also available on my GitHub: https://github.com/jgzquanquan/Spyder_wos
title_wos version 1.0
```python
import re
# from threading import Thread   # threads were considered; version 2.0 uses processes instead
from multiprocessing import Process
from multiprocessing import Manager
import requests
import time
import xlrd
from bs4 import BeautifulSoup
from lxml import etree


class SpiderMain(object):
    def __init__(self, sid, kanming):
        self.headers = {
            'Origin': 'https://apps.webofknowledge.com',
            'Referer': 'https://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=r1zsjrxofactqsl6uqh&preferencesSaved=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        # the form data captured in Figure 2; kanming is the paper title to search for
        self.form_data = {
            'fieldCount': 1,
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'SID': sid,
            'max_field_count': 25,  # value lost in the original listing; 25 assumed
            'formUpdated': 'true',
            'value(input1)': kanming,
            'value(select1)': 'TI',  # TI = search by Title
            'value(hidInput1)': '',
            'limitStatus': 'collapsed',
            'ss_lemmatization': 'On',
            'ss_spellchecking': 'Suggest',
            'SinceLastVisit_UTC': '',
            'SinceLastVisit_DATE': '',
            'period': 'Range Selection',
            'range': 'ALL',
            'startYear': '1982',
            'endYear': '2017',
            'update_back2search_link_param': 'yes',
            'ssStatus': 'display:none',
            'ss_showsuggestions': 'ON',
            'ss_query_language': 'auto',
            'ss_numDefaultGeneralSearchFields': 1,
            'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
        }
        # form data for delete_history(), which clears the server-side search history
        self.form_data2 = {
            'product': 'WOS',
            'prev_search_mode': 'CombineSearches',
            'search_mode': 'CombineSearches',
            'SID': sid,
            'action': 'remove',
            'goToPageLoc': 'SearchHistoryTableBanner',
            'currUrl': 'https://apps.webofknowledge.com/WOS_CombineSearches_input.do?SID=' + sid + '&product=WOS&search_mode=CombineSearches',
            'x': 48,  # click coordinates; the x value was garbled in the original listing
            'y': 9,
            'dSet': 1
        }

    def craw(self, root_url, i):
        cited, download = [], []  # defaults, so the except branch can still return them
        try:
            s = requests.Session()
            r = s.post(root_url, data=self.form_data, headers=self.headers)
            r.encoding = r.apparent_encoding
            tree = etree.HTML(r.text)
            # the classes found by inspecting the result page (Figure 3)
            cited = tree.xpath("//div[@class='search-results-data-cite']/a/text()")
            download = tree.xpath(".//div[@class='alum_text']/span/text()")
            flag = 0
            print(i, cited, download, r.url)
            return cited, download, flag
        except Exception as e:
            if i == 0:
                print(e)
            print(i)
            flag = 1
            return cited, download, flag

    def delete_history(self):
        murl = 'https://apps.webofknowledge.com/WOS_CombineSearches.do'
        s = requests.Session()
        s.post(murl, data=self.form_data2, headers=self.headers)


root_url = 'https://apps.webofknowledge.com/WOS_GeneralSearch.do'

if __name__ == '__main__':
    # sid = '6AYLQ8ZFGGVXDTACTV9'
    root = 'http://www.webofknowledge.com/'
    s = requests.get(root)
    sid = re.findall(r'SID=\w+&', s.url)[0].replace('SID=', '').replace('&', '')
    data = xlrd.open_workbook('2015 graduate published thesis.xlsx')
    table = data.sheets()[2]  # which sheet of the workbook to read
    nrows = table.nrows
    ncols = table.ncols
    ctype = 1
    xf = 0
    for i in range(2, nrows):
        csv = open('2015_3.csv', 'a')
        fail = open('fail.txt', 'a')
        if i % 100 == 0:  # fetch a fresh SID every 100 rows
            s = requests.get(root)
            sid = re.findall(r'SID=\w+&', s.url)[0].replace('SID=', '').replace('&', '')
        kanming = table.cell(i, 5).value  # the paper title in column 6
        obj_spider = SpiderMain(sid, kanming)
        cited, download, flag = obj_spider.craw(root_url, i)
        if flag == 1:
            fail.write(str(i) + '\n')
        else:
            if len(cited) == 0:
                cited.append(0)
            print(cited)
            if len(download) == 0:
                download.append(0)
                download.append(0)
            print(download)
            csv.write(str(i) + ',' + str(cited[0]) + ',' + str(download[0]) + ',' + str(download[1]) + '\n')
        csv.close()
        fail.close()
```
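A design choice worth noting in version 1.0: Web of Science ties each session to an SID token in the URL, and the token eventually expires, so the script fetches a fresh one every 100 rows. The refresh is nothing more than a GET to the home page followed by a regular-expression match on the redirect URL; a minimal standalone sketch:

```python
# Minimal sketch of the SID refresh: requests follows the redirect from the
# home page, and the session ID appears in the final URL.
import re
import requests

s = requests.get('http://www.webofknowledge.com/')
sid = re.findall(r'SID=\w+&', s.url)[0].replace('SID=', '').replace('&', '')
print(sid)
```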
title_wos version 2.0
```python
import re
from multiprocessing import Process
from multiprocessing import Manager
import requests
import time
import xlrd
from bs4 import BeautifulSoup
from lxml import etree


class SpiderMain(object):
    # unchanged from version 1.0
    def __init__(self, sid, kanming):
        self.headers = {
            'Origin': 'https://apps.webofknowledge.com',
            'Referer': 'https://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=r1zsjrxofactqsl6uqh&preferencesSaved=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        self.form_data = {
            'fieldCount': 1,
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'SID': sid,
            'max_field_count': 25,  # value lost in the original listing; 25 assumed
            'formUpdated': 'true',
            'value(input1)': kanming,
            'value(select1)': 'TI',
            'value(hidInput1)': '',
            'limitStatus': 'collapsed',
            'ss_lemmatization': 'On',
            'ss_spellchecking': 'Suggest',
            'SinceLastVisit_UTC': '',
            'SinceLastVisit_DATE': '',
            'period': 'Range Selection',
            'range': 'ALL',
            'startYear': '1982',
            'endYear': '2017',
            'update_back2search_link_param': 'yes',
            'ssStatus': 'display:none',
            'ss_showsuggestions': 'ON',
            'ss_query_language': 'auto',
            'ss_numDefaultGeneralSearchFields': 1,
            'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
        }
        self.form_data2 = {
            'product': 'WOS',
            'prev_search_mode': 'CombineSearches',
            'search_mode': 'CombineSearches',
            'SID': sid,
            'action': 'remove',
            'goToPageLoc': 'SearchHistoryTableBanner',
            'currUrl': 'https://apps.webofknowledge.com/WOS_CombineSearches_input.do?SID=' + sid + '&product=WOS&search_mode=CombineSearches',
            'x': 48,  # click coordinates; the x value was garbled in the original listing
            'y': 9,
            'dSet': 1
        }

    def craw(self, root_url, i):
        cited, download = [], []
        try:
            s = requests.Session()
            r = s.post(root_url, data=self.form_data, headers=self.headers)
            r.encoding = r.apparent_encoding
            tree = etree.HTML(r.text)
            cited = tree.xpath("//div[@class='search-results-data-cite']/a/text()")
            download = tree.xpath(".//div[@class='alum_text']/span/text()")
            flag = 0
            print(cited, download, r.url)
            return cited, download, flag
        except Exception as e:
            # on error, record the row so it can be retried later to raise the success rate
            if i == 0:
                print(e)
            print(i)
            flag = 1
            return cited, download, flag

    def delete_history(self):
        murl = 'https://apps.webofknowledge.com/WOS_CombineSearches.do'
        s = requests.Session()
        s.post(murl, data=self.form_data2, headers=self.headers)


class MyThread(Process):
    """One crawl job per process; the result goes into a shared Manager dict."""

    def __init__(self, sid, kanming, i, dic):
        Process.__init__(self)
        self.row = i
        self.sid = sid
        self.kanming = kanming
        self.dic = dic

    def run(self):
        cited, download, fl = SpiderMain(self.sid, self.kanming).craw(root_url, self.row)
        self.dic[str(self.row)] = Result(download, cited, fl, self.kanming, self.row)


class Result():
    """Plain container for one row's crawl result."""

    def __init__(self, download, cited, fl, kanming, row):
        super().__init__()
        self.row = row
        self.kanming = kanming
        self.fl = fl
        self.cited = cited
        self.download = download


def runn(sid, kanming, i, d):
    # functional alternative to MyThread, usable as a Process target
    cited, download, fl = SpiderMain(sid, kanming).craw(root_url, i)
    d[str(i)] = Result(download, cited, fl, kanming, i)
    print(d)


root_url = 'https://apps.webofknowledge.com/WOS_GeneralSearch.do'

if __name__ == '__main__':
    # sid = '6AYLQ8ZFGGVXDTACTV9'
    root = 'http://www.webofknowledge.com/'
    s = requests.get(root)
    sid = re.findall(r'SID=\w+&', s.url)[0].replace('SID=', '').replace('&', '')
    data = xlrd.open_workbook('2015 graduate published thesis.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    ctype = 1
    xf = 0
    threads = []
    threadnum = 5          # number of crawler processes per batch
    d = Manager().dict()   # shared dict for collecting results across processes
    csv = open('2015_3.csv', 'a')
    fail = open('fail2015.txt', 'a')
    for i in range(2, nrows):
        if i % 100 == 0:   # fetch a fresh SID every 100 rows
            s = requests.get(root)
            sid = re.findall(r'SID=\w+&', s.url)[0].replace('SID=', '').replace('&', '')
        kanming = table.cell(i, 5).value
        t = MyThread(sid, kanming, i, d)
        threads.append(t)
        if i % threadnum == 0 or i == nrows - 1:
            # start the whole batch, wait for it, then write the results
            for t in threads:
                try:
                    t.daemon = True
                    t.start()
                except requests.exceptions.ReadTimeout:
                    continue
            for t in threads:
                t.join()
            for t in threads:
                rst = d[str(t.row)]
                cited, download, flag = rst.cited, rst.download, rst.fl
                if flag == 1:
                    fail.write(str(t.row) + '\n')  # original wrote str(i); t.row is the row that failed
                else:
                    if len(cited) == 0:
                        cited.append(0)
                    print(cited)
                    if len(download) == 0:
                        download.append(0)
                        download.append(0)
                    print(download)
                    csv.write(str(t.row) + ',' + str(cited[0]) + ',' + str(download[0]) + ',' + str(download[1]) + '\n')
            threads = []
    csv.close()
    fail.close()
```
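One note on the result-passing pattern in version 2.0: each multiprocessing.Process runs in its own address space, so attributes set inside run() never make it back to the parent. That is why the results are written into a Manager().dict() shared across processes. A minimal sketch of the pattern, with worker standing in for the crawl:

```python
# Minimal sketch: a Manager dict collects results from child processes.
from multiprocessing import Process, Manager

def worker(row, d):
    d[str(row)] = row * row  # stand-in for the crawl result

if __name__ == '__main__':
    d = Manager().dict()
    procs = [Process(target=worker, args=(i, d)) for i in range(5)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(dict(d))  # all five results are visible in the parent
```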