A web crawler class module.

Source: Internet
Author: User
Tags: StringReplace
ImageURLs returns the image links found by the analysis. LinkURLs returns the page links found by the analysis. BodyText returns the page body with all markup stripped. TitleText returns the page title.
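
For orientation, a minimal usage sketch, assuming the page source has already been downloaded into a string by some HTTP component (the unit itself does no networking); the AnalyzeExample procedure and the example URL are illustrative only:

uses
  Crawl;

procedure AnalyzeExample(const HTML: string);
var
  Crawler: TCrawler;
  I: Integer;
begin
  Crawler := TCrawler.Create;
  try
    Crawler.Init;                                 { allocate the URL lists and the hash pool }
    Crawler.PageAnalysis(HTML, 'http://www.aivisoft.net/');
    WriteLn('Title: ', Crawler.TitleText);        { page title with markup stripped }
    for I := 0 to Crawler.LinkURLs.Count - 1 do   { absolute links found on the page }
      WriteLn('Link:  ', Crawler.LinkURLs.Strings[I]);
    for I := 0 to Crawler.ImageURLs.Count - 1 do  { .jpg / .bmp / .jpeg links }
      WriteLn('Image: ', Crawler.ImageURLs.Strings[I]);
  finally
    Crawler.Free;
  end;
end;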

{
  A class for page analysis
  Author: Liu
  Mail: GEO (DOT) CRA (AT) Gmail (DOT) com
  Web: http://www.aivisoft.net/
}
unit Crawl;

interface

uses
  Math, Windows, SysUtils, Variants, Classes, UniTypes;

const
  PoolSize: LongInt = $100;
  SourceTags1: array [0..9] of string = ('&lt;', '&gt;', '&amp;', '&quot;', '&reg;',
    '&copy;', '&trade;', '&ensp;', '&emsp;', '&nbsp;');
  SourceTags2: array [0..9] of string = ('&lt', '&gt', '&amp', '&quot', '&reg',
    '&copy', '&trade', '&ensp', '&emsp', '&nbsp');
  { The last six replacement characters were garbled in transcription; the ANSI codes for
    (R), (C) and (TM) plus plain spaces for ensp/emsp/nbsp are assumed here }
  DestTags: array [0..9] of Char = ('<', '>', '&', '"', #174, #169, #153, ' ', ' ', ' ');

type
  TCrawler = class
  private
    LinkPool: array [0..$FF] of TStringList;
    function Hash(S: string): LongInt;
    function GetPlainText(S: string): string;
    function NaiveMatch(S, T: string; Start, LengthOfS, LengthOfT: LongInt): LongInt;
  public
    ImageURLs, LinkURLs: TStringList;
    BodyText, TitleText: string;
    procedure Init;
    procedure PageAnalysis(SrcHTML, SrcURL: string);
    function LoadFromFile(FileName: string): Boolean;
    function SaveToFile(FileName: string): Boolean;
    destructor Destroy; override;
  end;

implementation

function TCrawler.Hash(S: string): LongInt;
var
  I, Total: LongInt;
begin
  Total := 0;
  for I := 1 to Length(S) do Inc(Total, Ord(S[I]));
  Result := Total mod PoolSize;
end;

function TCrawler.GetPlainText(S: string): string;
var
  I, J, K, L, LengthOfS, T1, T2, Y1, Y2: LongInt;
  CR, LF, Tab: Char;
  FlagS, HasSpace: Boolean;
  LowerS, NewS: string;
begin
  CR := Chr(13); LF := Chr(10); Tab := Chr(9);
  K := 1; I := 1; LengthOfS := Length(S); NewS := S;
  while I <= LengthOfS do begin
    FlagS := False;
    while (NewS[I] = CR) or (NewS[I] = LF) do begin
      Inc(I); FlagS := I > LengthOfS;
      if FlagS then Break;
    end;
    if not FlagS then NewS[K] := NewS[I] else Dec(K);
    Inc(I); Inc(K);
  end;
  SetLength(NewS, K - 1);
  { Remove CR/LF characters from the page }
  NewS := StringReplace(NewS, '</p>', Chr(13) + Chr(10), [rfReplaceAll, rfIgnoreCase]);
  NewS := StringReplace(NewS, '<br>', Chr(13) + Chr(10), [rfReplaceAll, rfIgnoreCase]);
  NewS := StringReplace(NewS, '</div>', Chr(13) + Chr(10), [rfReplaceAll, rfIgnoreCase]);
  LowerS := LowerCase(NewS); LengthOfS := Length(NewS);
  K := NaiveMatch(LowerS, '<script', 1, LengthOfS, 7); I := K;
  L := NaiveMatch(LowerS, '</script>', K + 7, LengthOfS, 9);
  while L > 0 do begin
    L := L + 9;
    K := NaiveMatch(LowerS, '<script', L, LengthOfS, 7);
    if K = 0 then K := LengthOfS + 1;
    Move(NewS[L], NewS[I], K - L);
    I := I + K - L;
    L := NaiveMatch(LowerS, '</script>', K + 7, LengthOfS, 9);
  end;
  if I > 0 then SetLength(NewS, I - 1);
  { Strip <script> blocks }
  LowerS := LowerCase(NewS); LengthOfS := Length(NewS);
  K := NaiveMatch(LowerS, '<style', 1, LengthOfS, 6); I := K;
  L := NaiveMatch(LowerS, '</style>', K + 6, LengthOfS, 8);
  while L > 0 do begin
    L := L + 8;
    K := NaiveMatch(LowerS, '<style', L, LengthOfS, 6);
    if K = 0 then K := LengthOfS + 1;
    Move(NewS[L], NewS[I], K - L);
    I := I + K - L;
    L := NaiveMatch(LowerS, '</style>', K + 6, LengthOfS, 8);
  end;
  if I > 0 then SetLength(NewS, I - 1);
  { Strip <style> blocks }
  LowerS := LowerCase(NewS); LengthOfS := Length(NewS);
  K := NaiveMatch(LowerS, '<', 1, LengthOfS, 1); I := K;
  L := NaiveMatch(LowerS, '>', K + 1, LengthOfS, 1);
  while L > 0 do begin
    repeat
      T1 := 0; T2 := 0;
      for J := K to L do begin
        if LowerS[J] = '"' then Inc(T1);
        if LowerS[J] = '''' then Inc(T2);
      end;
      Y1 := T1 mod 2; Y2 := T2 mod 2;
      if (Y1 > 0) or (Y2 > 0) then
        L := NaiveMatch(LowerS, '>', L + 1, LengthOfS, 1);
    until (L = 0) or ((Y1 = 0) and (Y2 = 0));
    if L = 0 then Break;
    { Ignore '>' characters inside "..." or '...' attribute values }
    L := L + 1;
    K := NaiveMatch(LowerS, '<', L, LengthOfS, 1);
    if K = 0 then K := LengthOfS + 1;
    Move(NewS[L], NewS[I], K - L);
    I := I + K - L;
    L := NaiveMatch(LowerS, '>', K + 1, LengthOfS, 1);
  end;
  if I > 0 then SetLength(NewS, I - 1);
  { Strip tags between < and > }
  for I := 0 to 9 do begin
    NewS := StringReplace(NewS, SourceTags1[I], DestTags[I], [rfReplaceAll, rfIgnoreCase]);
    NewS := StringReplace(NewS, SourceTags2[I], DestTags[I], [rfReplaceAll, rfIgnoreCase]);
  end;
  { Replace HTML entities }
  { The search literal in the next call was lost in transcription; a full-width space is assumed }
  NewS := StringReplace(NewS, '　', ' ', [rfReplaceAll]);
  LengthOfS := Length(NewS);
  for I := 1 to LengthOfS do if NewS[I] = Tab then NewS[I] := ' ';
  K := 1; I := 1; LengthOfS := Length(NewS);
  while I <= LengthOfS do begin
    FlagS := False; HasSpace := False;
    while NewS[I] = ' ' do begin
      Inc(I); FlagS := I > LengthOfS;
      HasSpace := True;
      if FlagS then Break;
    end;
    if HasSpace then Dec(I);
    if not FlagS then NewS[K] := NewS[I] else Dec(K);
    Inc(I); Inc(K);
  end;
  SetLength(NewS, K - 1);
  NewS := StringReplace(NewS, LF + ' ', LF, [rfReplaceAll]);
  NewS := StringReplace(NewS, ' ' + CR, CR, [rfReplaceAll]);
  { Collapse runs of spaces and trim spaces around line breaks }
  Result := Trim(NewS);
end;

function TCrawler.NaiveMatch(S, T: string; Start, LengthOfS, LengthOfT: LongInt): LongInt;
var
  I, J, K: LongInt;
  Success: Boolean;
begin
  Success := False;
  { Only scan positions where T can still fit inside S, so S[K] never runs past the end }
  for I := Start to LengthOfS - LengthOfT + 1 do begin
    Success := True; K := I;
    for J := 1 to LengthOfT do begin
      if S[K] <> T[J] then begin
        Success := False;
        Break;
      end;
      Inc(K);
    end;
    if Success then begin
      Result := I;
      Break;
    end;
  end;
  if not Success then Result := 0;
end;

procedure TCrawler.Init;
var
  I: LongInt;
begin
  ImageURLs := TStringList.Create;
  LinkURLs := TStringList.Create;
  for I := 0 to PoolSize - 1 do begin
    LinkPool[I] := TStringList.Create;
    LinkPool[I].Sorted := True;
  end;
end;

procedure TCrawler.PageAnalysis(SrcHTML, SrcURL: string);
var
  I, J, K, L, LengthOfHTML, HashCode: LongInt;
  StrQuot, StrSpace, StrTriangle, StrQuot2, StrNewLine, StrCross: LongInt;
  RootURL, HostName, LowerHTML, SubURL, DestURL, Header: string;
begin
  ImageURLs.Clear; LinkURLs.Clear;
  RootURL := SrcURL; Header := 'http://';
  if LowerCase(Copy(RootURL, 1, 6)) = 'ftp://' then begin
    Delete(RootURL, 1, 6);
    Header := 'ftp://';
  end;
  if LowerCase(Copy(RootURL, 1, 7)) = 'http://' then Delete(RootURL, 1, 7);
  if LowerCase(Copy(RootURL, 1, 8)) = 'https://' then begin
    Delete(RootURL, 1, 8);
    Header := 'https://';
  end;
  while RootURL[Length(RootURL)] = '/' do begin
    Delete(RootURL, Length(RootURL), 1);
    if RootURL = '' then Break;
  end;
  if RootURL = '' then Exit;
  K := Pos('/', RootURL);
  if K > 0 then HostName := Copy(RootURL, 1, K - 1) else HostName := RootURL;
  LengthOfHTML := Length(SrcHTML);
  LowerHTML := LowerCase(SrcHTML);
  { Parse <a href= links }
  K := NaiveMatch(LowerHTML, '<a href=', 1, LengthOfHTML, 8);
  while K > 0 do begin
    K := K + 8; L := MaxLongInt;
    StrQuot := NaiveMatch(LowerHTML, Chr(39), K + 1, LengthOfHTML, 1);
    if (StrQuot < L) and (StrQuot > 0) then L := StrQuot;
    StrTriangle := NaiveMatch(LowerHTML, '>', K, LengthOfHTML, 1);
    if (StrTriangle < L) and (StrTriangle > 0) then L := StrTriangle;
    StrSpace := NaiveMatch(LowerHTML, ' ', K, LengthOfHTML, 1);
    if (StrSpace < L) and (StrSpace > 0) then L := StrSpace;
    StrCross := NaiveMatch(LowerHTML, '#', K, LengthOfHTML, 1);
    if (StrCross < L) and (StrCross > 0) then L := StrCross;
    StrQuot2 := NaiveMatch(LowerHTML, '"', K + 1, LengthOfHTML, 1);
    if (StrQuot2 < L) and (StrQuot2 > 0) then L := StrQuot2;
    StrNewLine := NaiveMatch(LowerHTML, Chr(10), K, LengthOfHTML, 1);
    if (StrNewLine < L) and (StrNewLine > 0) then L := StrNewLine;
    if L < MaxLongInt then begin
      SubURL := TrimRight(Copy(SrcHTML, K, L - K));
      if SubURL <> '' then begin
        while SubURL[1] = '"' do begin
          Delete(SubURL, 1, 1);
          if SubURL = '' then Break;
        end;
        if SubURL <> '' then begin
          while SubURL[1] = Chr(39) do begin
            Delete(SubURL, 1, 1);
            if SubURL = '' then Break;
          end;
          if SubURL <> '' then begin
            if ('ftp://' = LowerCase(Copy(SubURL, 1, 6))) or
               ('http://' = LowerCase(Copy(SubURL, 1, 7))) or
               ('https://' = LowerCase(Copy(SubURL, 1, 8))) then
              DestURL := SubURL
            else begin
              if SubURL[1] = '/' then
                DestURL := Header + HostName + SubURL
              else
                DestURL := Header + RootURL + '/' + SubURL;
            end;
            HashCode := Hash(DestURL);
            if LinkPool[HashCode].IndexOf(DestURL) = -1 then begin
              LinkURLs.Add(DestURL);
              LinkPool[HashCode].Add(DestURL);
              if (LowerCase(Copy(DestURL, Length(DestURL) - 3, 4)) = '.jpg') or
                 (LowerCase(Copy(DestURL, Length(DestURL) - 3, 4)) = '.bmp') or
                 (LowerCase(Copy(DestURL, Length(DestURL) - 4, 5)) = '.jpeg') then begin
                ImageURLs.Add(DestURL);
              end;
            end;
          end;
        end;
      end;
      K := NaiveMatch(LowerHTML, '<a href=', L, LengthOfHTML, 8);
    end else Break;
  end;
  { Parse <img src= links; the search literal was lost in transcription and is
    reconstructed as '<img src=' from the 9-character offset used below }
  K := NaiveMatch(LowerHTML, '<img src=', 1, LengthOfHTML, 9);
  while K > 0 do begin
    K := K + 9; L := MaxLongInt;
    StrQuot := NaiveMatch(LowerHTML, Chr(39), K + 1, LengthOfHTML, 1);
    if (StrQuot < L) and (StrQuot > 0) then L := StrQuot;
    StrTriangle := NaiveMatch(LowerHTML, '>', K, LengthOfHTML, 1);
    if (StrTriangle < L) and (StrTriangle > 0) then L := StrTriangle;
    StrSpace := NaiveMatch(LowerHTML, ' ', K, LengthOfHTML, 1);
    if (StrSpace < L) and (StrSpace > 0) then L := StrSpace;
    StrQuot2 := NaiveMatch(LowerHTML, '"', K + 1, LengthOfHTML, 1);
    if (StrQuot2 < L) and (StrQuot2 > 0) then L := StrQuot2;
    StrNewLine := NaiveMatch(LowerHTML, Chr(10), K, LengthOfHTML, 1);
    if (StrNewLine < L) and (StrNewLine > 0) then L := StrNewLine;
    if L < MaxLongInt then begin
      SubURL := TrimRight(Copy(SrcHTML, K, L - K));
      if SubURL <> '' then begin
        while SubURL[1] = '"' do begin
          Delete(SubURL, 1, 1);
          if SubURL = '' then Break;
        end;
        if SubURL <> '' then begin
          while SubURL[1] = Chr(39) do begin
            Delete(SubURL, 1, 1);
            if SubURL = '' then Break;
          end;
          if SubURL <> '' then begin
            if (LowerCase(Copy(SubURL, Length(SubURL) - 3, 4)) = '.jpg') or
               (LowerCase(Copy(SubURL, Length(SubURL) - 3, 4)) = '.bmp') or
               (LowerCase(Copy(SubURL, Length(SubURL) - 4, 5)) = '.jpeg') then begin
              if ('ftp://' = LowerCase(Copy(SubURL, 1, 6))) or
                 ('http://' = LowerCase(Copy(SubURL, 1, 7))) or
                 ('https://' = LowerCase(Copy(SubURL, 1, 8))) then
                DestURL := SubURL
              else begin
                if SubURL[1] = '/' then
                  DestURL := Header + HostName + SubURL
                else
                  DestURL := Header + RootURL + '/' + SubURL;
              end;
              HashCode := Hash(DestURL);
              if LinkPool[HashCode].IndexOf(DestURL) = -1 then begin
                ImageURLs.Add(DestURL);
                LinkPool[HashCode].Add(DestURL);
              end;
            end;
          end;
        end;
      end;
      K := NaiveMatch(LowerHTML, '<img src=', L, LengthOfHTML, 9);
    end else Break;
  end;
  { Get title text }
  TitleText := '';
  K := NaiveMatch(LowerHTML, '<title>', 1, LengthOfHTML, 7);
  if K > 0 then begin
    K := K + 7;
    L := NaiveMatch(LowerHTML, '</title>', K, LengthOfHTML, 8);
    if L > 0 then
      TitleText := Copy(SrcHTML, K, L - K);
  end;
  TitleText := GetPlainText(TitleText);
  { Get body text }
  BodyText := '';
  K := NaiveMatch(LowerHTML, '<body', 1, LengthOfHTML, 5);
  if K > 0 then begin
    K := NaiveMatch(LowerHTML, '>', K + 5, LengthOfHTML, 1);
    if K > 0 then begin
      K := K + 1;
      L := NaiveMatch(LowerHTML, '</body>', K, LengthOfHTML, 7);
      if L = 0 then L := LengthOfHTML;
      BodyText := Copy(SrcHTML, K, L - K);
    end;
  end;
  BodyText := GetPlainText(BodyText);
end;

function TCrawler.LoadFromFile(FileName: string): Boolean;
var
  I, J, N: LongInt;
  S: string;
begin
  try
    AssignFile(Input, FileName); Reset(Input);
    for I := 0 to PoolSize - 1 do begin
      ReadLn(N);
      for J := 0 to N - 1 do begin
        ReadLn(S);
        LinkPool[I].Add(S);
      end;
    end;
    CloseFile(Input);
    Result := True;
  except
    Result := False;
  end;
end;

function TCrawler.SaveToFile(FileName: string): Boolean;
var
  I, J: LongInt;
begin
  try
    AssignFile(Output, FileName); Rewrite(Output);
    for I := 0 to PoolSize - 1 do begin
      WriteLn(LinkPool[I].Count);
      for J := 0 to LinkPool[I].Count - 1 do WriteLn(LinkPool[I].Strings[J]);
    end;
    CloseFile(Output);
    Result := True;
  except
    Result := False;
  end;
end;

destructor TCrawler.Destroy;
var
  I: LongInt;
begin
  ImageURLs.Free;
  LinkURLs.Free;
  for I := 0 to PoolSize - 1 do LinkPool[I].Free;
  inherited;
end;

end.
