提取Python stringlib中的“BMHBNFS”字串尋找演算法

來源:互聯網
上載者:User

Python中的stringlib字串尋找演算法是Boyer-Moore, Horspool, Sunday, Bloom Filter幾種演算法的合成體, 大概的原理如下:

def find(s, p):
    """Return the index of the first occurrence of p in s, or -1.

    Illustrative sketch of the stringlib search idea: compare the last
    character of the window first (Boyer-Moore), shift by a precomputed
    distance after a failed candidate (Horspool), and jump past the
    character just after the window when it does not occur in the
    pattern at all (Sunday).

    Args:
        s: Sequence to search in (e.g. a str).
        p: Pattern to search for; same kind of sequence as s.

    Returns:
        The lowest index i such that s[i:i+len(p)] == p, or -1 when
        there is no match. An empty pattern matches at index 0.
    """
    n = len(s)
    m = len(p)
    if m == 0:
        # Nothing to compare; mirror str.find and match at the start.
        return 0

    last = p[m - 1]

    # Horspool shift used when the last character matched but the rest
    # of the window did not: distance from the right-most earlier
    # occurrence of `last` to the end of the pattern, or the full
    # pattern length when `last` occurs nowhere else.
    skip = m
    for k in range(m - 1):
        if p[k] == last:
            skip = m - 1 - k

    i = 0
    while i <= n - m:
        # The character just past the window only exists when the window
        # is not already flush with the end of s.
        next_absent = i + m < n and s[i + m] not in p
        if s[i + m - 1] == last:  # (boyer-moore)
            # potential match
            if s[i:i + m - 1] == p[:m - 1]:
                return i
            if next_absent:
                i += m + 1  # (sunday)
            else:
                i += skip  # (horspool)
        elif next_absent:
            i += m + 1  # (sunday)
        else:
            i += 1
    return -1  # not found

以下是具體實現:

/* stringlib: fastsearch implementation */

#ifndef STRINGLIB_FASTSEARCH_H
#define STRINGLIB_FASTSEARCH_H

#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* fast search/count implementation, based on a mix between boyer-
   moore and horspool, with a few more bells and whistles on the top.
   for some more background, see: http://effbot.org/zone/stringlib.htm */

/* note: this version only reads s[0..n-1]; the look-ahead (s[i+m]) and
   look-behind (s[i-1]) probes are bounds-checked, so the buffer does not
   have to be NUL-terminated or padded.  also, the count mode returns -1
   if there cannot possibly be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */

/* search modes accepted by fastsearch() */
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2

#ifndef LONG_BIT
#define LONG_BIT 32
#endif

#if LONG_BIT >= 128
#define STRINGLIB_BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define STRINGLIB_BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define STRINGLIB_BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* one-word bloom filter: each character sets/tests the bit selected by
   its low bits.  a clear bit proves the character is not in the pattern;
   a set bit only means it might be. */
#define STRINGLIB_BLOOM_ADD(mask, ch) \
    ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
#define STRINGLIB_BLOOM(mask, ch)     \
    ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))

/*
 * Search for (or count) occurrences of p[0..m-1] inside s[0..n-1].
 *
 *   s, n      haystack and its length in bytes
 *   p, m      pattern and its length in bytes
 *   maxcount  FAST_COUNT only: stop and return maxcount once this many
 *             matches were seen (a negative value never triggers)
 *   mode      FAST_COUNT, FAST_SEARCH or FAST_RSEARCH
 *
 * Returns:
 *   FAST_SEARCH   offset of the first match, or -1
 *   FAST_RSEARCH  offset of the last match, or -1
 *   FAST_COUNT    number of non-overlapping matches (capped at maxcount);
 *                 -1 when m > n, m <= 0 or maxcount == 0
 */
ssize_t fastsearch(const char *s, ssize_t n,
                   const char *p, ssize_t m,
                   ssize_t maxcount, int mode)
{
    unsigned long mask;
    ssize_t skip, count = 0;
    ssize_t i, j, mlast, w;

    /* w is the last offset at which the pattern still fits */
    w = n - m;

    if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) {
        return -1;
    }

    /* look for special cases */
    if (m <= 1) {
        if (m <= 0) {
            return -1;
        }
        /* use special case for 1-character strings */
        if (mode == FAST_COUNT) {
            for (i = 0; i < n; i++)
                if (s[i] == p[0]) {
                    count++;
                    if (count == maxcount) {
                        return maxcount;
                    }
                }
            return count;
        }
        else if (mode == FAST_SEARCH) {
            for (i = 0; i < n; i++)
                if (s[i] == p[0]) {
                    return i;
                }
        }
        else {      /* FAST_RSEARCH */
            for (i = n - 1; i > -1; i--)
                if (s[i] == p[0]) {
                    return i;
                }
        }
        return -1;
    }

    mlast = m - 1;
    /* the loops below add their own +1/-1 step on top of skip */
    skip = mlast - 1;
    mask = 0;

    if (mode != FAST_RSEARCH) {
        /* create compressed boyer-moore delta 1 table */

        /* process pattern[:-1] */
        for (i = 0; i < mlast; i++) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[mlast]) {
                skip = mlast - i - 1;
            }
        }
        /* process pattern[-1] outside the loop */
        STRINGLIB_BLOOM_ADD(mask, p[mlast]);

        for (i = 0; i <= w; i++) {
            /* note: using mlast in the skip path slows things down on x86 */
            if (s[i + m - 1] == p[m - 1]) {
                /* candidate match */
                for (j = 0; j < mlast; j++)
                    if (s[i + j] != p[j]) {
                        break;
                    }
                if (j == mlast) {
                    /* got a match! */
                    if (mode != FAST_COUNT) {
                        return i;
                    }
                    count++;
                    if (count == maxcount) {
                        return maxcount;
                    }
                    /* continue after the match (non-overlapping count) */
                    i = i + mlast;
                    continue;
                }
                /* miss: check if next character is part of pattern.
                   at i == w there is no next character (s[i + m] would
                   be s[n]); either branch ends the loop, so take the
                   one that does not touch memory. */
                if (i < w && !STRINGLIB_BLOOM(mask, s[i + m])) {
                    i = i + m;
                }
                else {
                    i = i + skip;
                }
            }
            else {
                /* skip: check if next character is part of pattern
                   (same bounds guard as above) */
                if (i < w && !STRINGLIB_BLOOM(mask, s[i + m])) {
                    i = i + m;
                }
            }
        }
    }
    else {      /* FAST_RSEARCH */
        /* create compressed boyer-moore delta 1 table */

        /* process pattern[0] outside the loop */
        STRINGLIB_BLOOM_ADD(mask, p[0]);
        /* process pattern[:0:-1] */
        for (i = mlast; i > 0; i--) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[0]) {
                skip = i - 1;
            }
        }

        for (i = w; i >= 0; i--) {
            if (s[i] == p[0]) {
                /* candidate match */
                for (j = mlast; j > 0; j--)
                    if (s[i + j] != p[j]) {
                        break;
                    }
                if (j == 0) {
                    /* got a match! */
                    return i;
                }
                /* miss: check if previous character is part of pattern.
                   at i == 0 there is no previous character; s[-1] must
                   not be read. */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) {
                    i = i - m;
                }
                else {
                    i = i - skip;
                }
            }
            else {
                /* skip: check if previous character is part of pattern
                   (same bounds guard as above) */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) {
                    i = i - m;
                }
            }
        }
    }

    if (mode != FAST_COUNT) {
        return -1;
    }
    return count;
}

#endif

測試代碼

#include <arch/cycle.h>

/*
 * Smoke test for fastsearch(): times one FAST_SEARCH call with
 * get_cycle_count() and then counts all occurrences with FAST_COUNT.
 *
 * NOTE(review): get_cycle_count() comes from <arch/cycle.h>, which is not
 * a standard header -- presumably the platform's cycle-counter API;
 * confirm before building elsewhere.
 */
int main(int argc, char **argv)
{
    const char *str = "GET / HTTP 1.0\r\nHost: www.xxx.com\r\nCache: \r\nCache:\r\n Length:\r\n";
    ssize_t rc = 0;
    uint64_t start, end;

    (void)argc;
    (void)argv;

    start = get_cycle_count();
    rc = fastsearch(str, strlen(str), "Cache:", 6, 2, FAST_SEARCH);
    end = get_cycle_count();

    /* rc is signed (-1 means "not found"), so print it with %zd; the
       cycle delta is cast to match %llu regardless of how uint64_t is
       defined on this platform. */
    printf("fastsearch return %zd cost %llu \n", rc,
           (unsigned long long)(end - start));

    /* only form str + rc when a match was actually found */
    if (rc >= 0) {
        printf("result = %s\n", str + rc);
    }
    else {
        printf("result = (not found)\n");
    }

    rc = fastsearch(str, strlen(str), "Cache:", 6, -1, FAST_COUNT);
    printf("result = %zd\n", rc);

    return 0;
}

看stringlib測試資料, 還是蠻可以的.

我在tile平台上測試發現還沒有snort中的BMH演算法速度快.

不過這個只是單一測試, 沒有考慮到cache的情況, 僅供參考.

原文參考:

The stringlib Library(http://effbot.org/zone/stringlib.htm)有詳細的描述。

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.