Links: http://www.1point3acres.com/bbs/thread-91000-1-1.html
For web crawling with relatively simple rules, SAS can do the job; this example is purely for fun. As an introduction to SAS, the SAS Base and Advanced certification textbooks are recommended: the certifications themselves are not much use, but the textbooks cover enough for a professional SAS programmer, and after them the DATA step, SQL, and macros used in daily work should be no problem.
/*
 * %webscholar
 *
 * Fetches Google Scholar result pages for the fixed query q=python and, for
 * every result found, stores the paper title and its citation count in
 * mywork.results_web (columns: titles, citenumber).
 *
 * Parameters: none.
 * Side effects: assigns libref MYWORK and fileref WEB; creates
 * mywork.results_web, mywork.web and mywork.extracted; performs HTTP requests.
 */
%macro Webscholar;
    %local pageno url;

    /* The memlib option keeps the library data in memory instead of on   */
    /* drive D: to speed up reads; as a consequence the data sets created */
    /* here are not written to the hard drive.                            */
    libname mywork "D:\" memlib;

    /* Create the empty result table that accumulates, across all pages, */
    /* the title and the citation count of each Google Scholar hit.      */
    proc sql;
        create table mywork.results_web
            (titles     char(500),
             citenumber char(500));
    quit;

    /* pageno is the start= offset of a Google Scholar result page.       */
    /* 0 and 10 crawl the first two pages; raise the upper bound to crawl */
    /* more. The keyword after q= can be replaced by any other keyword.   */
    %do pageno = 0 %to 10 %by 10;

        data _null_;
            length url $256;
            /* Single quotes keep the macro processor from trying to */
            /* resolve the ampersands of the query string.           */
            url = 'http://scholar.google.com/scholar?start=0&q=python&hl=en&as_sdt=0,5';
            /* Double quotes here so that the macro variable pageno resolves. */
            url = prxchange("s/start=0/start=&pageno/", 1, url);
            /* symputx strips the trailing blanks of the $256 variable, */
            /* which symput would have carried into the URL.            */
            call symputx("url", url);
        run;

        /* recfm=n reads the page as a stream without record boundaries, so */
        /* it can be consumed in 256-byte chunks. A single line of HTML can */
        /* be longer than the 32767-character limit of a SAS character      */
        /* variable and would otherwise be truncated.                       */
        filename web url "%superq(url)" recfm=n debug;

        /* $varying256. reads exactly len bytes (at most 256) per */
        /* observation; see the SAS documentation for details.    */
        data mywork.web;
            length webtext $256;
            infile web length=len;
            input webtext $varying256. len;
            textlength = len;
        run;

        data mywork.extracted;
            length s $32767;   /* longest character variable SAS can handle */
            length r $500;
            length cite $500;
            /* s is a rolling buffer: each observation appends its chunk, */
            /* and retain stops s from being reset to missing.            */
            retain s;
            retain entry_re title_re clean_re cite_re number_re;

            if _n_ = 1 then do;
                /* NOTE(review): the two title patterns were truncated in  */
                /* the source this code was recovered from. They are       */
                /* reconstructed on the assumption that each result title  */
                /* sits in an h3 element with class gs_rt. Verify against  */
                /* the current Google Scholar markup.                      */

                /* A complete entry: title heading followed by a cite link. */
                entry_re  = prxparse('/<h3 class="gs_rt"[^>]*>.*?<\/h3>.*?>Cited by \d+<\/a>/i');
                /* The title heading alone. */
                title_re  = prxparse('/<h3 class="gs_rt"[^>]*>.*?<\/h3>/i');
                /* Tags, bracketed markers and HTML entities to strip from */
                /* the title. NOTE(review): the replacement part was lost  */
                /* in the source; an empty replacement is assumed.         */
                clean_re  = prxparse('s/(<[^>]*?>)|(\[[^\]]*?\])|(&[^;]*?;\s?)//');
                /* The citation link. NOTE(review): reconstructed from a */
                /* mangled pattern; assumes the English page (hl=en).    */
                cite_re   = prxparse('/>Cited by \d+<\/a>/i');
                /* Keeps only the digits of the matched citation link. */
                number_re = prxparse('s/(\D*)(\d*)(\D*)/$2/');
            end;

            set mywork.web;
            /* NOTE(review): cats removes leading and trailing blanks of  */
            /* each chunk, so a blank that falls on a chunk boundary is   */
            /* lost; kept as in the original.                             */
            s = cats(s, webtext);

            /* Extract every complete entry currently held in the buffer. */
            /* Other languages have ready-made packages for this; in SAS  */
            /* it is done by hand with the PRX functions.                 */
            position = .;
            do until (position = 0);
                call prxsubstr(entry_re, s, position, match_len);
                if position ^= 0 then do;
                    /* Title: cut it out, strip markup, drop it from the buffer. */
                    call prxsubstr(title_re, s, position, match_len);
                    r = substr(s, position, match_len);
                    r = prxchange(clean_re, -1, r);
                    s = substrn(s, position + match_len);

                    /* Citation link that follows the title. The entry */
                    /* pattern guarantees one exists; the guard only   */
                    /* protects substr from a zero position.           */
                    call prxsubstr(cite_re, s, position, match_len);
                    if position ^= 0 then do;
                        cite = substr(s, position, match_len);
                        /* Extract the number. */
                        cite = prxchange(number_re, 1, cite);
                        s = substrn(s, position + match_len);
                        output;
                    end;
                end;
            end;

            /* Keep the buffer below the 32767 limit by discarding the */
            /* oldest 256 characters once it grows past 29000.         */
            if length(s) > 29000 then s = substrn(s, 257);
        run;

        /* Append this page to the accumulated results: two columns, the */
        /* title of the paper and its number of citations.               */
        proc sql;
            insert into mywork.results_web (titles, citenumber)
                select r, cite
                from mywork.extracted;
        quit;

    %end;
%mend Webscholar;
%webscholar
"Reprint" with SAS crawl Web data simple version