標籤:blog os io 資料 ar div amp log
本文對C的正則庫regex和pcre在做網域名稱驗證的情境下做評測。
驗證DNS網域名稱的Regex為:
"^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"
對於正常DNS請求日誌中的6177578條日誌做正則驗證處理。
1,pcre
評測所用的pcre的版本號碼是:7.8.3
#include <stdio.h>#include <string.h>#include <pcre.h>#define OVECCOUNT 30 /* should be a multiple of 3 */#define EBUFLEN 128#define BUFLEN 1024int main(int argc, char *argv[]){ pcre *re; const char *error; int erroffset; FILE *fd; int ovector[OVECCOUNT]; int rc, i; int succ = 0, fail = 0; char src[1024]; char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"; printf("Pattern: %s\n", pattern); re = pcre_compile(pattern, 0, &error, &erroffset, NULL); if (re == NULL) { printf("PCRE compilation failed at offset %d: %s\n", erroffset, error); return 1; } if ((fd = fopen(argv[1], "r")) == NULL) { printf("open file error\n"); return 1; } while(fgets(src, 1024, fd)) { rc = pcre_exec(re, NULL, src, strlen(src), 0, 0, ovector, OVECCOUNT); if (rc < 0) { fail++; } else { succ++; } } printf("success:%d fail:%d\n", succ, fail); fclose(fd); free(re); return 0;}
處理完所有資料的耗時是:
$time ./pcre_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]{2,}\.)$
success:6177443 fail:135
real 0m8.257s
user 0m8.194s
sys 0m0.058s
2,regex
#include <stdio.h>
#include <string.h>
#include <regex.h>

#define SUBSLEN 100
#define EBUFLEN 1280
#define BUFLEN 1024

/*
 * POSIX regex benchmark driver.
 *
 * Reads the file named by argv[1] line by line, matches every line against
 * the domain-name pattern and prints how many lines matched and how many
 * did not.
 *
 * Returns 0 on success, 1 on a usage error, a pattern compilation failure
 * or when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
    regex_t re;
    regmatch_t subs[SUBSLEN];
    char errbuf[EBUFLEN];
    char line[BUFLEN];
    int err;
    int succ = 0, fail = 0;
    FILE *fd;
    /*
     * NOTE: the TLD part uses "+" here, whereas the PCRE program in this
     * article uses "{2,}"; the pattern is kept as published so the printed
     * output stays the same.
     */
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]+\\.)$";

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    /*
     * REG_NEWLINE lets "$" match before the '\n' that fgets() leaves at the
     * end of each line.
     */
    err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err != 0) {
        /* regerror() needs the code returned by regcomp(). */
        regerror(err, &re, errbuf, sizeof(errbuf));
        fprintf(stderr, "error: regcomp: %s\n", errbuf);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        fprintf(stderr, "open file error\n");
        regfree(&re); /* do not leak the compiled pattern on this path */
        return 1;
    }

    while (fgets(line, sizeof(line), fd)) {
        err = regexec(&re, line, (size_t)SUBSLEN, subs, 0);
        /* Only a zero return is a match; any other code counts as fail. */
        if (err == 0) {
            succ++;
        } else {
            fail++;
        }
    }

    printf("success:%d, fails:%d\n", succ, fail);

    fclose(fd);
    regfree(&re);
    return 0;
}
處理完所有資料耗時:
$time ./regex_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]+\.)$
success:6177443, fails:135
real 0m50.876s
user 0m50.783s
sys 0m0.058s
3,結論。
可以看到,對於網域名稱驗證的情境,pcre明顯優於POSIX regex庫。在規則已經編譯好的情況下,pcre每秒大約處理74.8萬條網域名稱,而regex每秒大約處理12.1萬條。