Function extracthtmltagvalues (const htmltext: string; tagname, attribname: string; VaR values: tstrings): integer;
Function findfirstcharafterspace (const line: string; startpos: integer): integer;
VaR I: integer;
Begin
Result: =-1;
For I: = startpos to length (line) Do
Begin
If (line [I] <> '') then
Begin
Result: = I;
Exit;
End;
End;
End;
Function findfirstspaceafterchars (const line: string; startpos: integer): integer;
Begin
Result: = posex ('', line, startpos );
End;
Function findfirstspacebeforechars (const line: string; startpos: integer): integer;
VaR I: integer;
Begin
Result: = 1;
For I: = startpos downto 1 do
Begin
If (line [I] = '') then
Begin
Result: = I;
Exit;
End;
End;
End;
VaR innertag: string;
Lastpos, lastinnerpos: integer;
SPOs, LPOS, rpos: integer;
Attribvalue: string;
Closingchar: Char;
Tempattribname: string;
Begin
Result: = 0;
Lastpos: = 1;
While (true) Do
Begin
// Find outer tags' <'&'>'
LPOS: = posex ('<', htmltext, lastpos );
If (LPOS <= 0) Then break;
Rpos: = posex ('>', htmltext, LPOS + 1 );
If (rpos <= 0) then
Lastpos: = LPOS + 1
Else
Lastpos: = rpos + 1;
// Get inner tag
Innertag: = copy (htmltext, LPOS + 1, RPos-LPos-1 );
Innertag: = trim (innertag); // remove Spaces
If (length (innertag) <length (tagname) then continue;
// Check Tag Name
If (sametext (copy (innertag, 1, length (tagname), tagname) then
Begin
// Found tag
Attribvalue: = '';
Lastinnerpos: = length (tagname) + 1;
While (lastinnerpos <length (innertag) do
Begin
// Find first '=' After lastinnerpos
Rpos: = posex ('=', innertag, lastinnerpos );
If (rpos <= 0) Then break;
// This way you can check for multiple attrib names and not a specific B
SPOs: = findfirstspacebeforechars (innertag, rpos );
Tempattribname: = trim (copy (innertag, SPOs, rpos-SPOs ));
If (true) then
Begin
// Found correct tag
LPOS: = findfirstcharafterspace (innertag, rpos + 1 );
If (LPOS <= 0) then
Begin
Lastinnerpos: = rpos + 1;
Continue;
End;
LPOS: = findfirstcharafterspace (innertag, LPOS); // get to first char after '='
If (LPOS <= 0) then continue;
If (innertag [LPOS] <> '"') and (innertag [LPOS] <> ''') then
Begin
// Attribvalue is not between '"' or ''' so get it
Rpos: = findfirstspaceafterchars (innertag, LPOS + 1 );
If (rpos <= 0) then
Attribvalue: = copy (innertag, LPOS, length (innertag)-LPOS + 1)
Else
Attribvalue: = copy (innertag, LPOS, rpos-LPOS + 1 );
End
Else
Begin
// Get URL between' "'or '''
Closingchar: = innertag [LPOS];
Rpos: = posex (closingchar, innertag, LPOS + 1 );
If (rpos <= 0) then
Attribvalue: = copy (innertag, LPOS + 1, length (innertag)-LPos-1)
Else
Attribvalue: = copy (innertag, LPOS + 1, RPos-LPos-1)
End;
If (sametext (tempattribname, attribname) and (attribvalue <> '') then
Begin
Values. Add (attribvalue );
INC (result );
End;
End;
If (rpos <= 0) then
Lastinnerpos: = length (innertag)
Else
Lastinnerpos: = rpos + 1;
End;
End;
End;
End;
For eg. You want to extract all links in a page, just do:
VaR links: tstrings;
Begin
Links: = tstrings. Create;
Try
Linksfound: = extracthtmltagvalues (htmltext, 'A', 'href ', links );
Showmessage (links. Text );
Finally
Links. Free;
End;
End;
Original connection:
Http://www.delphi3000.com/articles/article_4365.asp? SK =