This article benchmarks two regular expression libraries for C, POSIX RegEx and PCRE, in a domain name validation scenario.
The regular expression used to validate a DNS domain name (written as a C string literal) is:
"^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"
The test data is 6177578 entries taken from normal DNS request logs; each entry is checked against the regular expression.
1. PCRE
The PCRE version used for evaluation is 7.8.3.
#include <stdio.h>#include <string.h>#include <pcre.h>#define OVECCOUNT 30 /* should be a multiple of 3 */#define EBUFLEN 128#define BUFLEN 1024int main(int argc, char *argv[]){ pcre *re; const char *error; int erroffset; FILE *fd; int ovector[OVECCOUNT]; int rc, i; int succ = 0, fail = 0; char src[1024]; char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"; printf("Pattern: %s\n", pattern); re = pcre_compile(pattern, 0, &error, &erroffset, NULL); if (re == NULL) { printf("PCRE compilation failed at offset %d: %s\n", erroffset, error); return 1; } if ((fd = fopen(argv[1], "r")) == NULL) { printf("open file error\n"); return 1; } while(fgets(src, 1024, fd)) { rc = pcre_exec(re, NULL, src, strlen(src), 0, 0, ovector, OVECCOUNT); if (rc < 0) { fail++; } else { succ++; } } printf("success:%d fail:%d\n", succ, fail); fclose(fd); free(re); return 0;}
The time consumed for processing all the data is:
$ time ./pcre_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]{2,}\.)$
success:6177443 fail:135
real    0m8.257s
user    0m8.194s
sys     0m0.058s
2. RegEx
/*
 * regex_t: count how many lines of a file match the domain-name pattern,
 * using the POSIX <regex.h> API.
 *
 * Usage: regex_t <file>
 *
 * Prints the pattern, then "success:<n>, fails:<m>" where <n> is the number
 * of lines that matched and <m> the number that did not.
 */
#include <stdio.h>
#include <string.h>
#include <regex.h>

#define SUBSLEN 100
#define EBUFLEN 1280
#define BUFLEN 1024

/*
 * Returns 0 after processing the whole file, 1 on a usage error, a pattern
 * compilation error, or when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
    regex_t re;
    regmatch_t subs[SUBSLEN];
    char errbuf[EBUFLEN];
    int err;
    int succ = 0, fail = 0;
    FILE *fd;
    char line[BUFLEN];
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]+\\.)$";

    /* argv[1] is only valid when an argument was actually supplied. */
    if (argc < 2) {
        printf("usage: %s <file>\n", argc > 0 ? argv[0] : "regex_t");
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    /*
     * Keep regcomp()'s return value: regerror() needs that error code to
     * produce the message.
     */
    err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err != 0) {
        (void)regerror(err, &re, errbuf, sizeof errbuf);
        printf("error: regcomp: %s\n", errbuf);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        printf("open file error\n");
        regfree(&re); /* do not leak the compiled pattern on this path */
        return 1;
    }

    /*
     * fgets() keeps the trailing newline in line; the pattern is compiled
     * with REG_NEWLINE and the line is matched with the newline included.
     *
     * NOTE(review): subs is filled in but never read. It is kept so that the
     * work done per line is the same as in the original program.
     */
    while (fgets(line, sizeof line, fd)) {
        err = regexec(&re, line, (size_t)SUBSLEN, subs, 0);
        /* Only a return of 0 is a match; REG_NOMATCH and errors are failures. */
        if (err == 0) {
            succ++;
        } else {
            fail++;
        }
    }

    printf("success:%d, fails:%d\n", succ, fail);

    fclose(fd);
    regfree(&re);
    return 0;
}
Time consumed for processing all data:
$ time ./regex_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]+\.)$
success:6177443, fails:135
real    0m50.876s
user    0m50.783s
sys     0m0.058s
3. Conclusion.
We can see that, in the domain name validation scenario, PCRE is significantly faster than the POSIX RegEx library. With the pattern already compiled, PCRE processes about 750,000 domain names per second (6177578 entries in 8.257 s), while RegEx processes about 121,000 domain names per second (6177578 entries in 50.876 s), so PCRE is roughly six times faster.