I. Preface: the source code of a web page contains many nested tags.
For example, nested div tags: butp <div> finally <div> Aurora </div> @ 126.com</div> <div class="cited1"> ggff </div>
In our web-page parsing work we sometimes need to resolve this nesting. In general, the nested tags are flattened into a linear list. Using the example above to illustrate the problem, resolving the nesting yields the following elements:
<Div> Aurora </div>
<Div> finally <div> Aurora </div> @ 126.com</div>
<div class="cited1"> ggff </div>
The core code is as follows:
Code
/// <summary>
/// Helper that flattens nested occurrences of one HTML tag into a list of element strings.
/// </summary>
class Themeirassist
{
    /// <summary>
    /// Finds every element named <paramref name="Tags"/> in <paramref name="Rawtext"/>,
    /// including elements nested inside one another, and appends the text of each element
    /// (from the "&lt;" of its start tag through the "&gt;" of its end tag) to
    /// <paramref name="Result"/>.
    /// </summary>
    /// <param name="Rawtext">Markup to scan. It is only read, never reassigned.</param>
    /// <param name="Tags">
    /// Bare tag name such as "div". It is matched case-insensitively and treated as literal
    /// text, not as a regular expression.
    /// </param>
    /// <param name="Result">
    /// List that receives the matched elements, in the order their end tags appear in the text.
    /// Existing entries are kept.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="Rawtext"/> or <paramref name="Result"/> is null.
    /// </exception>
    /// <exception cref="System.ArgumentException"><paramref name="Tags"/> is null or empty.</exception>
    /// <remarks>
    /// Each end tag is paired with the closest preceding start tag that has not been paired yet.
    /// Start tags that never get closed (for example self-closing ones) are not treated
    /// specially, so they can end up paired with a later end tag.
    /// </remarks>
    public static void Getnodesbytags(ref string Rawtext, string Tags, ref List<string> Result)
    {
        if (Rawtext == null)
            throw new System.ArgumentNullException("Rawtext");
        if (string.IsNullOrEmpty(Tags))
            throw new System.ArgumentException("A tag name is required.", "Tags");
        if (Result == null)
            throw new System.ArgumentNullException("Result");

        // Escape the name so characters such as '.' or '+' cannot change the pattern.
        string tagName = Regex.Escape(Tags);
        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // The look-ahead stops "<div" from also matching longer names such as "<divider".
        Regex regexBeginTag = new Regex("<" + tagName + @"(?=[\s/>])", options);
        // HTML allows whitespace before the closing '>' of an end tag.
        Regex regexEndTag = new Regex("</" + tagName + @"\s*>", options);

        // Record where every start tag begins and whether it has been paired already.
        List<Position> beginTagPos = new List<Position>();
        foreach (Match beginMatch in regexBeginTag.Matches(Rawtext))
        {
            Position pos = new Position();
            pos.NPOs = beginMatch.Index;
            pos.Viststatus = false;
            beginTagPos.Add(pos);
        }

        foreach (Match endMatch in regexEndTag.Matches(Rawtext))
        {
            // Walk the start tags from last to first so the innermost open one is found first.
            for (int j = beginTagPos.Count - 1; j >= 0; j--)
            {
                Position begin = beginTagPos[j];
                if (begin.NPOs > endMatch.Index || begin.Viststatus)
                    continue;

                // Use the real length of the end tag instead of assuming the 6 characters of "</div>".
                int length = endMatch.Index + endMatch.Length - begin.NPOs;
                Result.Add(Rawtext.Substring(begin.NPOs, length));
                begin.Viststatus = true;
                break;
            }
        }
    }
}
/// <summary>
/// Records the character offset of one tag in the scanned text together with a flag that
/// says whether the tag has already been paired.
/// </summary>
class Position
{
    private int pos;
    private bool visited;

    /// <summary>Character offset of the tag within the scanned text.</summary>
    public int NPOs
    {
        get { return pos; }
        set { pos = value; }
    }

    /// <summary>True once the tag has been paired; false until then.</summary>
    public bool Viststatus
    {
        get { return visited; }
        set { visited = value; }
    }
}
The main function test is as follows:
Code
/// <summary>
/// Test driver: reads the sample page, extracts every Div element, prints how many were found,
/// keeps only those whose start tag carries class="cited1", and prints the survivors.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string rawText;
    // Dispose the reader as soon as the file has been read.
    using (StreamReader sr = new StreamReader(@"D:\finally.txt", Encoding.GetEncoding("UTF-8")))
    {
        rawText = sr.ReadToEnd();
    }
    // string rawText = "butp <div> finally <div> Aurora </div> @ 126.com</div> <Div class=\"cited1\"> ggff </div>";

    List<string> result = new List<string>();
    Themeirassist.Getnodesbytags(ref rawText, "Div", ref result);
    Console.WriteLine(result.Count);

    Regex regexStandard = new Regex(@"^<Div\s+class=""cited1""", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Iterate backwards so removing an entry does not shift the ones still to be checked.
    for (int i = result.Count - 1; i >= 0; i--)
    {
        if (!regexStandard.IsMatch(result[i]))
            result.RemoveAt(i);
    }
    Console.WriteLine(result.Count);

    foreach (string s in result)
    {
        Console.WriteLine(" **************************************** *********** ");
        Console.Write(s);
        Console.WriteLine(" **************************************** *********** ");
    }
    // Keep the console window open until a key is pressed.
    Console.Read();
}