Although I have been using regular expressions in C++ for some years, I have always used the Boost library, which, frankly speaking, is not very pleasant to use. I had known about PCRE for a long time, but I always thought it was a library developed for PHP. I really must have been living on Mars.
When I mentioned this two days ago, on Tuesday, torch told me that PCRE is better than the regular expression library in Boost. So I tried it, and found that BCB actually ships with PCRE, although this is not mentioned in the documentation.
However, PCRE is a C library, which is not very convenient to use from C++. Although there are C++ wrappers such as PCRE++, they come only with a GNU build configuration and are difficult to port to BCB.
Because I do not need many of its features, I wrote a simple wrapper of my own that uses the VCL's AnsiString/TStringList. It is easy to use.
#include <pcre.h>
// Minimal wrapper around the PCRE C API that exposes results through VCL types.
//
// The object owns the compiled pattern (FRE) and the list of captured
// substrings (FMatches); both are released in the destructor.
class TPCRE
{
private:
    AnsiString FPattern;    // pattern text last passed to the constructor or compile()
    pcre * FRE;             // compiled pattern, or NULL when nothing is compiled
    TStrings * FMatches;    // substrings captured by the last successful exec()

    // Not copyable: a copy would share FRE and FMatches with the original and
    // both destructors would release them. Declared private and left undefined
    // so that an accidental copy fails at compile or link time.
    TPCRE(const TPCRE &);
    TPCRE & operator=(const TPCRE &);
public:
    // Stores aPattern and compiles it unless it is empty.
    __fastcall TPCRE(AnsiString aPattern="");
    __fastcall ~TPCRE();
    // Compiles aPattern; when aPattern is empty, recompiles the stored pattern.
    void __fastcall compile(AnsiString aPattern="");
    // Matches aStr and fills Matches (index 0 = whole match, 1.. = groups).
    // Returns the number of entries; throws Exception when there is no
    // compiled pattern, when nothing matches, or on a matching error.
    int __fastcall exec(AnsiString aStr); // return matched count
    // Returns aStr with every match of the pattern replaced by aRepStr.
    AnsiString __fastcall repeat_replace(AnsiString aStr, AnsiString aRepStr="");
    __property TStrings * Matches = { read = FMatches };
};
// Creates the wrapper and, when aPattern is not empty, compiles it at once.
//
// The destructor does not run for an object whose constructor throws, so the
// match list allocated in the initializer list is released here before any
// exception raised by compile() is propagated.
__fastcall TPCRE::TPCRE(AnsiString aPattern)
    : FPattern(aPattern), FRE(NULL), FMatches(new TStringList())
{
    if (FPattern.IsEmpty())
        return;
    try {
        compile();
    }
    catch (...) {
        delete FMatches;
        FMatches = NULL;
        throw;
    }
}
// Releases the compiled pattern and the match list.
__fastcall TPCRE::~TPCRE()
{
    // Memory returned by pcre_compile() is obtained through pcre_malloc, so it
    // is handed back through the matching pcre_free hook rather than free().
    if (FRE)
        pcre_free(FRE);
    delete FMatches;
}
// Compiles a pattern for later use by exec() / repeat_replace().
//
// aPattern: pattern to compile; when empty, the previously stored pattern is
//           compiled again.
// Throws Exception with the PCRE error text and offset when the pattern does
// not compile. In that case no compiled pattern is kept (FRE is NULL).
void __fastcall TPCRE::compile(AnsiString aPattern)
{
    if (!aPattern.IsEmpty())
        FPattern = aPattern;

    // Drop the previous compiled pattern first, through PCRE's own
    // deallocator, and clear the pointer so it never dangles.
    if (FRE) {
        pcre_free(FRE);
        FRE = NULL;
    }

    const char * error = NULL;
    int erroffset = 0;
    FRE = pcre_compile(FPattern.c_str(), 0, &error, &erroffset, NULL);
    if (FRE == NULL)
        throw Exception(AnsiString("PCRE compilation failed at offset ")
            + IntToStr(erroffset) + ": " + (error ? error : "unknown error"));
}
// Matches aStr against the compiled pattern and stores the captured
// substrings in Matches (index 0 = whole match, 1.. = capture groups).
//
// Returns the number of entries stored in Matches.
// Throws Exception when no pattern is compiled, when nothing matches, or when
// pcre_exec reports an error. Matches is empty after a failed call.
int __fastcall TPCRE::exec(AnsiString aStr)
{
    if (!FRE)
        throw Exception("No pattern or have not be compiled!");

    const int OVECCOUNT = 30;
    int ovector[OVECCOUNT];

    // Clear up front so results of an earlier call cannot be mistaken for
    // results of this one when it fails.
    FMatches->Clear();

    int rc = pcre_exec(FRE, NULL, aStr.c_str(), aStr.Length(), 0, ovector, OVECCOUNT);
    if (rc == PCRE_ERROR_NOMATCH)
        throw Exception("Sorry, no match ...");
    if (rc < 0)
        throw Exception(AnsiString("Matching error ") + IntToStr(rc));

    // rc == 0 means the match succeeded but ovector was too small for every
    // group; only the first OVECCOUNT/3 offset pairs were filled in.
    if (rc == 0)
        rc = OVECCOUNT / 3;

    for (int i = 0; i < rc; ++i) {
        const int first = ovector[2*i];
        const int last = ovector[2*i+1];
        if (first < 0)
            FMatches->Add("");  // group did not take part in the match
        else
            FMatches->Add(aStr.SubString(first + 1, last - first)); // SubString is 1-based
    }
    return rc;
}
// Returns a copy of aStr in which every match of the compiled pattern is
// replaced by aRepStr (an empty aRepStr removes the matches).
//
// Throws Exception when no pattern is compiled or pcre_exec reports an error.
//
// NOTE: each search runs on the not-yet-processed remainder of aStr as if it
// were a fresh subject, so anchors and lookbehind see the remainder's start as
// the start of the text.
AnsiString __fastcall TPCRE::repeat_replace(AnsiString aStr, AnsiString aRepStr)
{
    if (!FRE)
        throw Exception("No pattern or have not be compiled!");

    const int OVECCOUNT = 30;
    int ovector[OVECCOUNT];
    const int total = aStr.Length();
    const char * const base = aStr.c_str();
    int pos = 0;            // 0-based offset of the unprocessed remainder
    AnsiString result;

    for (;;) {
        const int rc = pcre_exec(FRE, NULL, base + pos, total - pos, 0, ovector, OVECCOUNT);
        if (rc == PCRE_ERROR_NOMATCH)
            break;
        if (rc < 0)
            throw Exception(AnsiString("Matching error ") + IntToStr(rc));

        // rc == 0 only says ovector was too small for all groups; the whole
        // match in ovector[0..1] is still valid, so keep going.
        const int matchStart = ovector[0];  // relative to base + pos
        const int matchEnd = ovector[1];

        // Text before the match is kept, the match itself is replaced.
        result += aStr.SubString(pos + 1, matchStart) + aRepStr;

        if (matchEnd > matchStart) {
            pos += matchEnd;
            continue;
        }

        // Empty match: step over one character (copying it) so the search
        // always makes progress instead of matching here forever.
        pos += matchStart;
        if (pos >= total)
            break;
        result += aStr.SubString(pos + 1, 1);
        ++pos;
    }

    // Always append whatever follows the last match. Deciding this from
    // "result is empty" is wrong when earlier replacements produced no output.
    result += aStr.SubString(pos + 1, total - pos);
    return result;
}
Usage is simple. Here is some sample code that converts HTML to plain text:
// Input: const char * s
// Output: ansistring sresult
STD: auto_ptr <tpcre> Re (New tpcre ("(? IMS) <title> ([^ <] *);
Ansistring sresult;
If (re-> exec (s)> 1)
Sresult = Re-> matches-> strings [1]. Trim ();
Re-> compile ("(? IMS) <body [^>] *> (.*)");
If (re-> exec (s)> 1 ){
Ansistring STR = Re-> matches-> strings [1]. Trim ();
STR = stringreplace (STR, "/R", "", treplaceflags () <rfreplaceall );
STR = stringreplace (STR, "/N", "", treplaceflags () <rfreplaceall );
STR = stringreplace (STR, "& nbsp;", "", treplaceflags () <rfreplaceall );
// Replace <br/> to/R/n
Re-> compile ("(? I) <br/S */?> ");
STR = Re-> repeat_replace (STR, "/R/N ");
// Remove <script...>... </SCRIPT>
Re-> compile ("(? IMS) <SCRIPT [^>] *>. * </SCRIPT> ");
STR = Re-> repeat_replace (STR );
// Remove <! --... -->
Re-> compile ("(? IMS) <! --. * --> ");
STR = Re-> repeat_replace (STR );
// Remove <...>
Re-> compile ("(? IMS) <[^>] *> ");
STR = Re-> repeat_replace (STR );
Sresult + = STR;
}
This mainly involves extracting the title and the body, removing the carriage returns and line feeds from the body, replacing &nbsp; with a space, replacing <br> tags with line breaks, removing scripts and comments, and finally removing all remaining HTML tags.