A class module of web crawler.

Last Update:2018-12-04 Source: Internet

Author: User

Tags stringreplace

Developer on Alibaba Coud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

Imageurls returns the link of the analyzed image. Linkurls returns the webpage link obtained from the analysis. Bodytext returns the part of the webpage body removed from the mark. Titletext returns the title of the webpage.

{
A class for PAGE analysis
Author: Liu
Mail: GEO (DOT) CRA (AT) Gmail (DOT) com
Web: http://www.aivisoft.net/
}
Unit crawl;

Interface

Uses
Math, windows, sysutils, variants, classes, unitypes;

Const
Poolsize: longint = $100;
Sourcetags1: array [0 .. 9] of string = ('& lt;', '& gt;', '& amp;', '& quot;', '& reg ;',
'& Copy;', '& Trade;', '& ENSP;', '& emsp;', '& nbsp ;');
Sourcetags2: array [0 .. 9] of string = ('& lt', '& gt', '& amp', '& quot', '& reg ',
'& Copy',' & Trade ',' & ENSP ',' & emsp ',' & nbsp ');
Desttags: array [0 .. 9] of char = ('<', '> ','&','"','? ','? ','? ','','','');
Type
Tcrawler = Class
Private
Linkpool: array [0 .. $ ff] of tstringlist;
Function Hash (S: string): longint;
Function getplaintext (S: string): string;
Function naivematch (S, T: string; Start, lengthofs, lengthoft: longint): longint;
Public
Imageurls, linkurls: tstringlist;
Bodytext, titletext: string;
Procedure Init;
Procedure pageanalysis (srchtml, srcurl: string );
Function loadfromfile (filename: string): Boolean;
Function savetofile (filename: string): Boolean;
Destructor destroy; override;
End;

Implementation

Function tcrawler. Hash (S: string): longint;
VaR
I, total: longint;
Begin
Total: = 0;
For I: = 1 to length (s) do Inc (total, ord (s [I]);
Result: = total mod poolsize;
End;

Function tcrawler. getplaintext (S: string): string;
VaR
I, J, K, L, lengthofs, T1, T2, Y1, Y2: longint;
Cr, lf, Tab: Char;
Flags, hasspace: Boolean;
Lowers, news: string;
Begin
Cr: = CHR (13); lf: = CHR (10); tab: = CHR (9 );
K: = 1; I: = 1; lengthofs: = length (s); news: = s;
While I <= lengthofs do begin
Flags: = false;
While (news [I] = Cr) or (news [I] = lf) Do begin
INC (I); flags: = I> lengthofs;
If flags then break;
End;
If not flags then news [k]: = News [I] else Dec (k );
INC (I); Inc (k );
End;
Setlength (News, k-1 );
{Clear enters in page}
News: = stringreplace (News, '</P>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
News: = stringreplace (News, '<br>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
News: = stringreplace (News, '</div>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<script', 1, lengthofs, 7); I: = K;
L: = naivematch (lowers, '</SCRIPT>', K + 7, lengthofs, 9 );
While l> 0 do begin
L: = L + 9;
K: = naivematch (lowers, '<script', L, lengthofs, 7 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '</SCRIPT>', K + 7, lengthofs, 9 );
End;
If I> 0 then setlength (News, I-1 );
{Clearup scripts}
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<style', 1, lengthofs, 6); I: = K;
L: = naivematch (lowers, '</style>', K + 6, lengthofs, 8 );
While l> 0 do begin
L: = L + 8;
K: = naivematch (lowers, '<style', L, lengthofs, 6 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '</style>', K + 6, lengthofs, 8 );
End;
If I> 0 then setlength (News, I-1 );
{Clearup style code}
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<', 1, lengthofs, 1); I: = K;
L: = naivematch (lowers, '>', k + 1, lengthofs, 1 );
While l> 0 do begin
Repeat
T1: = 0; t2: = 0;
For J: = K to l do begin
If lowers [J] = '"'Then Inc (T1 );
If lowers [J] = ''' then Inc (T2 );
End;
Y1: = T1 mod 2; Y2: = t2 mod 2;
If (Y1> 0) or (Y2> 0) then
L: = naivematch (lowers, '>', L + 1, lengthofs, 1 );
Until (L = 0) or (Y1 = 0) and (y2 = 0 ));
If l = 0 Then break;
{Ignore the> in "..." or '....'}
L: = L + 1;
K: = naivematch (lowers, '<', L, lengthofs, 1 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '>', k + 1, lengthofs, 1 );
End;
If I> 0 then setlength (News, I-1 );
{Clear control code in <>}
For I: = 0 to 9 do begin
News: = stringreplace (News, sourcetags1 [I], desttags [I], [rfreplaceall, rfignorecase]);
News: = stringreplace (News, sourcetags2 [I], desttags [I], [rfreplaceall, rfignorecase]);
End;
{Replace marks}
News: = stringreplace (News, '','', [rfreplaceall]);
Lengthofs: = length (News );
For I: = 1 to lengthofs do if news [I] = tab then news [I]: = '';
K: = 1; I: = 1; lengthofs: = length (News );
While I <= lengthofs do begin
Flags: = false; hasspace: = false;
While (news [I] = '') Do begin
INC (I); flags: = I> lengthofs;
Hasspace: = true;
If flags then break;
End;
If hasspace then Dec (I );
If not flags then news [k]: = News [I] else Dec (k );
INC (I); Inc (k );
End;
Setlength (News, k-1 );
News: = stringreplace (News, LF + '', lf, [rfreplaceall]);
News: = stringreplace (News, ''+ Cr, Cr, [rfreplaceall]);
{Trim spaces and enters}
Result: = trim (News );
End;

Function tcrawler. naivematch (S, T: string; Start, lengthofs, lengthoft: longint): longint;
VaR
I, J, K: longint;
Success: Boolean;
Begin
Success: = false;
For I: = start to lengthofs do begin
Success: = true; K: = I;
For J: = 1 to lengthoft do begin
If s [k] <> T [J] Then begin
Success: = false;
Break;
End;
INC (k );
End;
If success then begin
Result: = I;
Break;
End;
End;
If not success then result: = 0;
End;

Procedure tcrawler. init;
VaR
I: longint;
Begin
Imageurls: = tstringlist. Create;
Linkurls: = tstringlist. Create;
For I: = 0 to poolsize-1 do begin
Linkpool [I]: = tstringlist. Create;
Linkpool [I]. Sorted: = true;
End;
End;

Procedure tcrawler. pageanalysis (srchtml, srcurl: string );
VaR
I, J, K, L, lengthofhtml, hashcode: longint;
Strquot, strspace, strtriangle, strquot2, strnewline, strcross: longint;
Rooturl, hostname, lowerhtml, suburl, desturl, header: string;
Begin
Imageurls. Clear; linkurls. Clear;
Rooturl: = srcurl; header: = 'HTTP ://';
If lowercase (copy (rooturl, 1, 6) = 'ftp: // 'then begin
Delete (rooturl, 1, 6 );
Header: = 'ftp ://';
End;
If lowercase (copy (rooturl, 1, 7) = 'HTTP: // 'then Delete (rooturl, 1, 7 );
If lowercase (copy (rooturl, 1, 8) = 'https: // 'then begin
Delete (rooturl, 1, 8 );
Header: = 'https ://';
End;
While rooturl [length (rooturl)] = '/' Do begin
Delete (rooturl, length (rooturl), 1 );
If rooturl = ''then break;
End;
If rooturl = ''then exit;
K: = pos ('/', rooturl );
If K> 0 then hostname: = copy (rooturl, 1, k-1) else hostname: = rooturl;
Lengthofhtml: = length (srchtml );
Lowerhtml: = lowercase (srchtml );
{Parsing links}
K: = naivematch (lowerhtml, '<a href =', 1, lengthofhtml, 8 );
While K> 0 do begin
K: = K + 8; L: = maxlongint;
Strquot: = naivematch (lowerhtml, CHR (39), k + 1, lengthofhtml, 1 );
If (strquot <L) and (strquot> 0) then l: = strquot;
Strtriangle: = naivematch (lowerhtml, '>', K, lengthofhtml, 1 );
If (strtriangle <L) and (strtriangle> 0) then l: = strtriangle;
Strspace: = naivematch (lowerhtml, '', K, lengthofhtml, 1 );
If (strspace <L) and (strspace> 0) then l: = strspace;
Strcross: = naivematch (lowerhtml, '#', K, lengthofhtml, 1 );
If (strcross <L) and (strcross> 0) then l: = strcross;
Strquot2: = naivematch (lowerhtml, '"', k + 1, lengthofhtml, 1 );
If (strquot2 <L) and (strquot2> 0) then l: = strquot2;
Strnewline: = naivematch (lowerhtml, CHR (10), K, lengthofhtml, 1 );
If (strnewline <L) and (strnewline> 0) then l: = strnewline;
If l <maxlongint then begin
Suburl: = trimright (copy (srchtml, K, L-k ));
If suburl <> ''then begin
While suburl [1] = '"'Do in
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
While suburl [1] = CHR (39) Do begin
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
If ('ftp: // '= lowercase (copy (suburl, 1, 6) or
('HTTP: // '= lowercase (copy (suburl, 1, 7) or
('Https: // '= lowercase (copy (suburl, 1, 8) then
Desturl: = suburl
Else begin
If suburl [1] = '/' then
Desturl: = header + hostname + suburl
Else
Desturl: = header + rooturl + '/' + suburl;
End;
Hashcode: = hash (desturl );
If linkpool [hashcode]. indexof (desturl) =-1 then begin
Linkurls. Add (desturl );
Linkpool [hashcode]. Add (desturl );
If (lowercase (copy (desturl, length (desturl)-3, 4) = '.jpg ') or
(Lowercase (copy (desturl, length (desturl)-3, 4) = '.bmp ') or
(Lowercase (copy (desturl, length (desturl)-4, 5) = '.jpeg ') then begin
Imageurls. Add (desturl );
End;
End;
End;
End;
End;
K: = naivematch (lowerhtml, '<a href =', L, lengthofhtml, 8 );
End else break;
End;
{Parsing image links}
K: = naivematch (lowerhtml, 'While K> 0 do begin
K: = K + 9; L: = maxlongint;
Strquot: = naivematch (lowerhtml, CHR (39), k + 1, lengthofhtml, 1 );
If (strquot <L) and (strquot> 0) then l: = strquot;
Strtriangle: = naivematch (lowerhtml, '>', K, lengthofhtml, 1 );
If (strtriangle <L) and (strtriangle> 0) then l: = strtriangle;
Strspace: = naivematch (lowerhtml, '', K, lengthofhtml, 1 );
If (strspace <L) and (strspace> 0) then l: = strspace;
Strquot2: = naivematch (lowerhtml, '"', k + 1, lengthofhtml, 1 );
If (strquot2 <L) and (strquot2> 0) then l: = strquot2;
Strnewline: = naivematch (lowerhtml, CHR (10), K, lengthofhtml, 1 );
If (strnewline <L) and (strnewline> 0) then l: = strnewline;
If l <maxlongint then begin
Suburl: = trimright (copy (srchtml, K, L-k ));
If suburl <> ''then begin
While suburl [1] = '"'Do in
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
While suburl [1] = CHR (39) Do begin
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
If (lowercase (copy (suburl, length (suburl)-3, 4) = '.jpg ') or
(Lowercase (copy (suburl, length (suburl)-3, 4) = '.bmp ') or
(Lowercase (copy (suburl, length (suburl)-4, 5) = '.jpeg ') then begin
If ('ftp: // '= lowercase (copy (suburl, 1, 6) or
('HTTP: // '= lowercase (copy (suburl, 1, 7) or
('Https: // '= lowercase (copy (suburl, 1, 8) then
Desturl: = suburl
Else begin
If suburl [1] = '/' then
Desturl: = header + hostname + suburl
Else
Desturl: = header + rooturl + '/' + suburl;
End;
Hashcode: = hash (desturl );
If linkpool [hashcode]. indexof (desturl) =-1 then begin
Imageurls. Add (desturl );
Linkpool [hashcode]. Add (desturl );
End;
End;
End;
End;
End;
K: = naivematch (lowerhtml, 'End else break;
End;
{Get title text}
Titletext: = '';
K: = naivematch (lowerhtml, '<title>', 1, lengthofhtml, 7 );
If K> 0 then begin
K: = K + 7;
L: = naivematch (lowerhtml, '</title>', K, lengthofhtml, 8 );
If l> 0 then
Titletext: = copy (srchtml, K, L-k );
End;
Titletext: = getplaintext (titletext );
{Get body text}
Bodytext: = '';
K: = naivematch (lowerhtml, '<body', 1, lengthofhtml, 5 );
If K> 0 then begin
K: = naivematch (lowerhtml, '>', K + 5, lengthofhtml, 1 );
If K> 0 then begin
K: = k + 1;
L: = naivematch (lowerhtml, '</body>', K, lengthofhtml, 7 );
If l = 0 then l: = lengthofhtml;
Bodytext: = copy (srchtml, K, L-k );
End;
End;
Bodytext: = getplaintext (bodytext );
End;

Function tcrawler. loadfromfile (filename: string): Boolean;
VaR
I, j, N: longint;
S: string;
Begin
Try
Assignfile (input, filename); reset (input );
For I: = 0 to poolsize-1 do begin
Readln (N );
For J: = 0 to n-1 do begin
Readln (s );
Linkpool [I]. Add (s );
End;
End;
Closefile (input );
Result: = true;
Except
Result: = false;
End;
End;

Function tcrawler. savetofile (filename: string): Boolean;
VaR
I, J: longint;
Begin
Try
Assignfile (output, filename); rewrite (output );
For I: = 0 to poolsize-1 do begin
Writeln (linkpool [I]. Count );
For J: = 0 to linkpool [I]. Count-1 do writeln (linkpool [I]. Strings [J]);
End;
Closefile (output );
Result: = true;
Except
Result: = false;
End;
End;

Destructor tcrawler. Destroy;
VaR
I: longint;
Begin
Imageurls. Free;
Linkurls. Free;
For I: = 0 to poolsize-1 do linkpool [I]. Free;
Inherited;
End;

End.

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More