Imageurls returns the link of the analyzed image. Linkurls returns the webpage link obtained from the analysis. Bodytext returns the part of the webpage body removed from the mark. Titletext returns the title of the webpage.
{
A class for PAGE analysis
Author: Liu
Mail: GEO (DOT) CRA (AT) Gmail (DOT) com
Web: http://www.aivisoft.net/
}
Unit crawl;
Interface
Uses
Math, windows, sysutils, variants, classes, unitypes;
Const
Poolsize: longint = $100;
Sourcetags1: array [0 .. 9] of string = ('& lt;', '& gt;', '& amp;', '& quot;', '& reg ;',
'& Copy;', '& Trade;', '& ENSP;', '& emsp;', '& nbsp ;');
Sourcetags2: array [0 .. 9] of string = ('& lt', '& gt', '& amp', '& quot', '& reg ',
'& Copy',' & Trade ',' & ENSP ',' & emsp ',' & nbsp ');
Desttags: array [0 .. 9] of char = ('<', '> ','&','"','? ','? ','? ','','','');
Type
Tcrawler = Class
Private
Linkpool: array [0 .. $ ff] of tstringlist;
Function Hash (S: string): longint;
Function getplaintext (S: string): string;
Function naivematch (S, T: string; Start, lengthofs, lengthoft: longint): longint;
Public
Imageurls, linkurls: tstringlist;
Bodytext, titletext: string;
Procedure Init;
Procedure pageanalysis (srchtml, srcurl: string );
Function loadfromfile (filename: string): Boolean;
Function savetofile (filename: string): Boolean;
Destructor destroy; override;
End;
Implementation
Function tcrawler. Hash (S: string): longint;
VaR
I, total: longint;
Begin
Total: = 0;
For I: = 1 to length (s) do Inc (total, ord (s [I]);
Result: = total mod poolsize;
End;
Function tcrawler. getplaintext (S: string): string;
VaR
I, J, K, L, lengthofs, T1, T2, Y1, Y2: longint;
Cr, lf, Tab: Char;
Flags, hasspace: Boolean;
Lowers, news: string;
Begin
Cr: = CHR (13); lf: = CHR (10); tab: = CHR (9 );
K: = 1; I: = 1; lengthofs: = length (s); news: = s;
While I <= lengthofs do begin
Flags: = false;
While (news [I] = Cr) or (news [I] = lf) Do begin
INC (I); flags: = I> lengthofs;
If flags then break;
End;
If not flags then news [k]: = News [I] else Dec (k );
INC (I); Inc (k );
End;
Setlength (News, k-1 );
{Clear enters in page}
News: = stringreplace (News, '</P>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
News: = stringreplace (News, '<br>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
News: = stringreplace (News, '</div>', CHR (13) + CHR (10), [rfreplaceall, rfignorecase]);
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<script', 1, lengthofs, 7); I: = K;
L: = naivematch (lowers, '</SCRIPT>', K + 7, lengthofs, 9 );
While l> 0 do begin
L: = L + 9;
K: = naivematch (lowers, '<script', L, lengthofs, 7 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '</SCRIPT>', K + 7, lengthofs, 9 );
End;
If I> 0 then setlength (News, I-1 );
{Clearup scripts}
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<style', 1, lengthofs, 6); I: = K;
L: = naivematch (lowers, '</style>', K + 6, lengthofs, 8 );
While l> 0 do begin
L: = L + 8;
K: = naivematch (lowers, '<style', L, lengthofs, 6 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '</style>', K + 6, lengthofs, 8 );
End;
If I> 0 then setlength (News, I-1 );
{Clearup style code}
Lowers: = lowercase (News); lengthofs: = length (News );
K: = naivematch (lowers, '<', 1, lengthofs, 1); I: = K;
L: = naivematch (lowers, '>', k + 1, lengthofs, 1 );
While l> 0 do begin
Repeat
T1: = 0; t2: = 0;
For J: = K to l do begin
If lowers [J] = '"'Then Inc (T1 );
If lowers [J] = ''' then Inc (T2 );
End;
Y1: = T1 mod 2; Y2: = t2 mod 2;
If (Y1> 0) or (Y2> 0) then
L: = naivematch (lowers, '>', L + 1, lengthofs, 1 );
Until (L = 0) or (Y1 = 0) and (y2 = 0 ));
If l = 0 Then break;
{Ignore the> in "..." or '....'}
L: = L + 1;
K: = naivematch (lowers, '<', L, lengthofs, 1 );
If K = 0 then K: = lengthofs + 1;
Move (news [L], news [I], K-l );
I: = I + k-l;
L: = naivematch (lowers, '>', k + 1, lengthofs, 1 );
End;
If I> 0 then setlength (News, I-1 );
{Clear control code in <>}
For I: = 0 to 9 do begin
News: = stringreplace (News, sourcetags1 [I], desttags [I], [rfreplaceall, rfignorecase]);
News: = stringreplace (News, sourcetags2 [I], desttags [I], [rfreplaceall, rfignorecase]);
End;
{Replace marks}
News: = stringreplace (News, '','', [rfreplaceall]);
Lengthofs: = length (News );
For I: = 1 to lengthofs do if news [I] = tab then news [I]: = '';
K: = 1; I: = 1; lengthofs: = length (News );
While I <= lengthofs do begin
Flags: = false; hasspace: = false;
While (news [I] = '') Do begin
INC (I); flags: = I> lengthofs;
Hasspace: = true;
If flags then break;
End;
If hasspace then Dec (I );
If not flags then news [k]: = News [I] else Dec (k );
INC (I); Inc (k );
End;
Setlength (News, k-1 );
News: = stringreplace (News, LF + '', lf, [rfreplaceall]);
News: = stringreplace (News, ''+ Cr, Cr, [rfreplaceall]);
{Trim spaces and enters}
Result: = trim (News );
End;
Function tcrawler. naivematch (S, T: string; Start, lengthofs, lengthoft: longint): longint;
VaR
I, J, K: longint;
Success: Boolean;
Begin
Success: = false;
For I: = start to lengthofs do begin
Success: = true; K: = I;
For J: = 1 to lengthoft do begin
If s [k] <> T [J] Then begin
Success: = false;
Break;
End;
INC (k );
End;
If success then begin
Result: = I;
Break;
End;
End;
If not success then result: = 0;
End;
Procedure tcrawler. init;
VaR
I: longint;
Begin
Imageurls: = tstringlist. Create;
Linkurls: = tstringlist. Create;
For I: = 0 to poolsize-1 do begin
Linkpool [I]: = tstringlist. Create;
Linkpool [I]. Sorted: = true;
End;
End;
Procedure tcrawler. pageanalysis (srchtml, srcurl: string );
VaR
I, J, K, L, lengthofhtml, hashcode: longint;
Strquot, strspace, strtriangle, strquot2, strnewline, strcross: longint;
Rooturl, hostname, lowerhtml, suburl, desturl, header: string;
Begin
Imageurls. Clear; linkurls. Clear;
Rooturl: = srcurl; header: = 'HTTP ://';
If lowercase (copy (rooturl, 1, 6) = 'ftp: // 'then begin
Delete (rooturl, 1, 6 );
Header: = 'ftp ://';
End;
If lowercase (copy (rooturl, 1, 7) = 'HTTP: // 'then Delete (rooturl, 1, 7 );
If lowercase (copy (rooturl, 1, 8) = 'https: // 'then begin
Delete (rooturl, 1, 8 );
Header: = 'https ://';
End;
While rooturl [length (rooturl)] = '/' Do begin
Delete (rooturl, length (rooturl), 1 );
If rooturl = ''then break;
End;
If rooturl = ''then exit;
K: = pos ('/', rooturl );
If K> 0 then hostname: = copy (rooturl, 1, k-1) else hostname: = rooturl;
Lengthofhtml: = length (srchtml );
Lowerhtml: = lowercase (srchtml );
{Parsing links}
K: = naivematch (lowerhtml, '<a href =', 1, lengthofhtml, 8 );
While K> 0 do begin
K: = K + 8; L: = maxlongint;
Strquot: = naivematch (lowerhtml, CHR (39), k + 1, lengthofhtml, 1 );
If (strquot <L) and (strquot> 0) then l: = strquot;
Strtriangle: = naivematch (lowerhtml, '>', K, lengthofhtml, 1 );
If (strtriangle <L) and (strtriangle> 0) then l: = strtriangle;
Strspace: = naivematch (lowerhtml, '', K, lengthofhtml, 1 );
If (strspace <L) and (strspace> 0) then l: = strspace;
Strcross: = naivematch (lowerhtml, '#', K, lengthofhtml, 1 );
If (strcross <L) and (strcross> 0) then l: = strcross;
Strquot2: = naivematch (lowerhtml, '"', k + 1, lengthofhtml, 1 );
If (strquot2 <L) and (strquot2> 0) then l: = strquot2;
Strnewline: = naivematch (lowerhtml, CHR (10), K, lengthofhtml, 1 );
If (strnewline <L) and (strnewline> 0) then l: = strnewline;
If l <maxlongint then begin
Suburl: = trimright (copy (srchtml, K, L-k ));
If suburl <> ''then begin
While suburl [1] = '"'Do in
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
While suburl [1] = CHR (39) Do begin
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
If ('ftp: // '= lowercase (copy (suburl, 1, 6) or
('HTTP: // '= lowercase (copy (suburl, 1, 7) or
('Https: // '= lowercase (copy (suburl, 1, 8) then
Desturl: = suburl
Else begin
If suburl [1] = '/' then
Desturl: = header + hostname + suburl
Else
Desturl: = header + rooturl + '/' + suburl;
End;
Hashcode: = hash (desturl );
If linkpool [hashcode]. indexof (desturl) =-1 then begin
Linkurls. Add (desturl );
Linkpool [hashcode]. Add (desturl );
If (lowercase (copy (desturl, length (desturl)-3, 4) = '.jpg ') or
(Lowercase (copy (desturl, length (desturl)-3, 4) = '.bmp ') or
(Lowercase (copy (desturl, length (desturl)-4, 5) = '.jpeg ') then begin
Imageurls. Add (desturl );
End;
End;
End;
End;
End;
K: = naivematch (lowerhtml, '<a href =', L, lengthofhtml, 8 );
End else break;
End;
{Parsing image links}
K: = naivematch (lowerhtml, 'While K> 0 do begin
K: = K + 9; L: = maxlongint;
Strquot: = naivematch (lowerhtml, CHR (39), k + 1, lengthofhtml, 1 );
If (strquot <L) and (strquot> 0) then l: = strquot;
Strtriangle: = naivematch (lowerhtml, '>', K, lengthofhtml, 1 );
If (strtriangle <L) and (strtriangle> 0) then l: = strtriangle;
Strspace: = naivematch (lowerhtml, '', K, lengthofhtml, 1 );
If (strspace <L) and (strspace> 0) then l: = strspace;
Strquot2: = naivematch (lowerhtml, '"', k + 1, lengthofhtml, 1 );
If (strquot2 <L) and (strquot2> 0) then l: = strquot2;
Strnewline: = naivematch (lowerhtml, CHR (10), K, lengthofhtml, 1 );
If (strnewline <L) and (strnewline> 0) then l: = strnewline;
If l <maxlongint then begin
Suburl: = trimright (copy (srchtml, K, L-k ));
If suburl <> ''then begin
While suburl [1] = '"'Do in
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
While suburl [1] = CHR (39) Do begin
Delete (suburl, 1, 1 );
If suburl = ''then break;
End;
If suburl <> ''then begin
If (lowercase (copy (suburl, length (suburl)-3, 4) = '.jpg ') or
(Lowercase (copy (suburl, length (suburl)-3, 4) = '.bmp ') or
(Lowercase (copy (suburl, length (suburl)-4, 5) = '.jpeg ') then begin
If ('ftp: // '= lowercase (copy (suburl, 1, 6) or
('HTTP: // '= lowercase (copy (suburl, 1, 7) or
('Https: // '= lowercase (copy (suburl, 1, 8) then
Desturl: = suburl
Else begin
If suburl [1] = '/' then
Desturl: = header + hostname + suburl
Else
Desturl: = header + rooturl + '/' + suburl;
End;
Hashcode: = hash (desturl );
If linkpool [hashcode]. indexof (desturl) =-1 then begin
Imageurls. Add (desturl );
Linkpool [hashcode]. Add (desturl );
End;
End;
End;
End;
End;
K: = naivematch (lowerhtml, 'End else break;
End;
{Get title text}
Titletext: = '';
K: = naivematch (lowerhtml, '<title>', 1, lengthofhtml, 7 );
If K> 0 then begin
K: = K + 7;
L: = naivematch (lowerhtml, '</title>', K, lengthofhtml, 8 );
If l> 0 then
Titletext: = copy (srchtml, K, L-k );
End;
Titletext: = getplaintext (titletext );
{Get body text}
Bodytext: = '';
K: = naivematch (lowerhtml, '<body', 1, lengthofhtml, 5 );
If K> 0 then begin
K: = naivematch (lowerhtml, '>', K + 5, lengthofhtml, 1 );
If K> 0 then begin
K: = k + 1;
L: = naivematch (lowerhtml, '</body>', K, lengthofhtml, 7 );
If l = 0 then l: = lengthofhtml;
Bodytext: = copy (srchtml, K, L-k );
End;
End;
Bodytext: = getplaintext (bodytext );
End;
Function tcrawler. loadfromfile (filename: string): Boolean;
VaR
I, j, N: longint;
S: string;
Begin
Try
Assignfile (input, filename); reset (input );
For I: = 0 to poolsize-1 do begin
Readln (N );
For J: = 0 to n-1 do begin
Readln (s );
Linkpool [I]. Add (s );
End;
End;
Closefile (input );
Result: = true;
Except
Result: = false;
End;
End;
Function tcrawler. savetofile (filename: string): Boolean;
VaR
I, J: longint;
Begin
Try
Assignfile (output, filename); rewrite (output );
For I: = 0 to poolsize-1 do begin
Writeln (linkpool [I]. Count );
For J: = 0 to linkpool [I]. Count-1 do writeln (linkpool [I]. Strings [J]);
End;
Closefile (output );
Result: = true;
Except
Result: = false;
End;
End;
Destructor tcrawler. Destroy;
VaR
I: longint;
Begin
Imageurls. Free;
Linkurls. Free;
For I: = 0 to poolsize-1 do linkpool [I]. Free;
Inherited;
End;
End.