This article evaluates two C regular-expression libraries, POSIX regex and PCRE, in a domain-name validation scenario.
The regular expression that validates a DNS domain name is:
"^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"
The validation is run against 6,177,578 entries taken from a normal DNS request log.
1. PCRE
The version of PCRE used for the evaluation is 7.8.3.
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
#include <stdio.h>
#include <string.h>
#include <pcre.h>
#define OVECCOUNT 30 /* should be a multiple of 3 */
#define EBUFLEN 128
#define BUFLEN 1024

/*
 * PCRE benchmark driver.
 *
 * Reads the file named by argv[1] line by line, matches every line against
 * the DNS domain-name pattern with pcre_exec(), and prints the number of
 * lines that matched and the number that did not.
 *
 * Returns 0 after a complete run, and 1 on a usage error, a pattern
 * compilation failure, or when the input file cannot be opened.
 */
int
main(int argc, char *argv[])
{
    pcre *re;
    const char *error;
    int erroffset;
    FILE *fd;
    int ovector[OVECCOUNT];
    int rc;
    int succ = 0, fail = 0;
    char src[BUFLEN];
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$";

    /* Without this check a missing argument hands a NULL path to fopen(). */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <domain-list-file>\n",
                argc > 0 ? argv[0] : "pcre_t");
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if (re == NULL) {
        fprintf(stderr, "PCRE compilation failed at offset %d: %s\n",
                erroffset, error);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        fprintf(stderr, "open file error\n");
        pcre_free(re);  /* do not leak the compiled pattern on this path */
        return 1;
    }

    /*
     * fgets() keeps the trailing '\n'. The pattern still matches because,
     * by default, PCRE lets '$' match just before a final newline.
     */
    while (fgets(src, sizeof(src), fd)) {
        rc = pcre_exec(re, NULL, src, (int)strlen(src), 0, 0,
                       ovector, OVECCOUNT);
        if (rc < 0) {
            /* PCRE_ERROR_NOMATCH or any other matching error. */
            fail++;
        } else {
            succ++;
        }
    }

    printf("success:%d fail:%d\n", succ, fail);

    fclose(fd);
    /* Memory from pcre_compile() must be released through pcre_free(). */
    pcre_free(re);
    return 0;
}
|
The time it takes to process all the data is:
$ time ./pcre_t Query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]{2,}\.)$
success:6177443 fail:135
real 0m8.257s
user 0m8.194s
sys 0m0.058s
2. regex
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
#include <stdio.h>
#include <string.h>
#include <regex.h>
#define SUBSLEN 100
#define EBUFLEN 1280
#define BUFLEN 1024
/*
 * POSIX regex benchmark driver.
 *
 * Reads the file named by argv[1] line by line, matches every line against
 * the DNS domain-name pattern with regexec(), and prints the number of
 * lines that matched and the number that did not.
 *
 * Returns 0 after a complete run, and 1 on a usage error, a pattern
 * compilation failure, or when the input file cannot be opened.
 */
int
main(int argc, char *argv[])
{
    regex_t re;
    regmatch_t subs[SUBSLEN];
    char errbuf[EBUFLEN];
    int err, succ = 0, fail = 0;
    FILE *fd;
    char line[BUFLEN];
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]+\\.)$";

    /* Without this check a missing argument hands a NULL path to fopen(). */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <domain-list-file>\n",
                argc > 0 ? argv[0] : "regex_t");
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    /*
     * REG_NEWLINE lets '$' match immediately before a newline, which is
     * needed because fgets() keeps the trailing '\n' on every line.
     * The return code is kept so regerror() can describe the failure.
     */
    err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err != 0) {
        regerror(err, &re, errbuf, sizeof(errbuf));
        fprintf(stderr, "error: regcomp: %s\n", errbuf);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        fprintf(stderr, "open file error\n");
        regfree(&re);  /* do not leak the compiled pattern on this path */
        return 1;
    }

    while (fgets(line, sizeof(line), fd)) {
        err = regexec(&re, line, (size_t)SUBSLEN, subs, 0);
        if (err == 0) {
            succ++;
        } else {
            /* REG_NOMATCH, or a matcher failure such as REG_ESPACE. */
            fail++;
        }
    }

    printf("success:%d, fails:%d\n", succ, fail);

    fclose(fd);
    regfree(&re);
    return 0;
}
|
The time it takes to process all the data is:
$ time ./regex_t Query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]+\.)$
success:6177443, fails:135
real 0m50.876s
user 0m50.783s
sys 0m0.058s
3. Conclusion
As you can see, in the domain-name validation scenario PCRE performs significantly better than the POSIX regex library. With the pattern already compiled, PCRE processes about 748,000 domain names per second, while regex processes approximately 121,000 per second.
Performance comparison of C regular-expression libraries for DNS domain name validation