Today's project is being written in Python, and it needs keyword checking and filtering/classification. In the past I did this kind of thing in C: it was very efficient, used little memory, and lookups were fast.
Moving to Python, my first idea was to pip-install a Python module implemented in C. Unfortunately, I did not find a suitable one. The alternative would be to write a Python extension module in C myself, but I do not yet have the ability to do that.
So it can only be written in pure Python; the performance is nearly as good, and using a little more memory does not matter.
Using a search engine, I found a Python implementation of a DFA on CSDN and compared it with the dictionary tree (trie) I had written in C. Some parts of the Python version are not quite right as written. By comparison, the C version is very efficient and uses very little space.
The article by that user: "DFA algorithm to implement sensitive word filtering (Python implementation)"
Here's the Python code:
class CNode(object):
    """One state of the keyword automaton.

    ``children`` is ``None`` until the first child is added; afterwards it is a
    dict mapping a character to a ``(CNode, is_word_end)`` tuple.
    """

    def __init__(self):
        self.children = None


# Words and messages are expected to use the same encoding (the original
# notes say UTF-8); matching is done element by element on the sequences given.
class CDfa(object):
    """Keyword matcher built from a list of words (a trie walked as a DFA)."""

    def __init__(self, lWords):
        """Build the automaton from the iterable of keywords ``lWords``."""
        self.root = CNode()
        for sWord in lWords:
            self.addWord(sWord)

    def addWord(self, word):
        """Insert ``word`` into the automaton. An empty word is a no-op."""
        node = self.root
        last = len(word) - 1
        for i, ch in enumerate(word):
            if node.children is None:
                node.children = {}
            is_end = (i == last)
            if ch not in node.children:
                node.children[ch] = (CNode(), is_end)
            elif is_end:
                # The path already exists (as a prefix of a longer word):
                # keep the child node and just flag it as a word end.
                child, _ = node.children[ch]
                node.children[ch] = (child, True)
            node = node.children[ch][0]

    def isContain(self, sMsg):
        """Return True if any keyword occurs anywhere in ``sMsg``, else False."""
        length = len(sMsg)
        for start in range(length):
            node = self.root
            j = start
            while (j < length and node.children is not None
                   and sMsg[j] in node.children):
                node, is_word = node.children[sMsg[j]]
                if is_word:
                    return True
                j += 1
        return False

    def filter(self, sMsg):
        """Return ``sMsg`` with every matched keyword replaced by ``*`` characters.

        At each position the shortest keyword starting there is replaced, one
        ``*`` per matched element, and scanning resumes right after the match.
        """
        pieces = []
        length = len(sMsg)
        i = 0
        while i < length:
            node = self.root
            j = i
            matched = False
            while (j < length and node.children is not None
                   and sMsg[j] in node.children):
                node, is_word = node.children[sMsg[j]]
                if is_word:
                    # Replace the keyword sMsg[i:j+1] with asterisks.
                    pieces.append(u'*' * (j - i + 1))
                    i = j + 1
                    matched = True
                    break
                j += 1
            if not matched:
                pieces.append(sMsg[i])
                i += 1
        return ''.join(pieces)
The following is the C code. trie_tree.h:
#ifndef TRIE_TREE_H_INCLUDED
#define TRIE_TREE_H_INCLUDED

/* Number of child slots per node: one for every possible byte value. */
#define WORD_NUM 256

/* One node of the byte-indexed trie. */
struct trie_node {
    struct trie_node *node[WORD_NUM]; /* children, indexed by the next byte */
    int value;                        /* byte value this node was created for */
    int exist;                        /* 1 when a stored word ends at this node */
};

/* Allocate a zero-initialised node carrying `value`. */
struct trie_node *create_trie_node(int value);

/* Insert the NUL-terminated byte string `word` below `root`. */
void trie_tree_insert_word(struct trie_node *root, unsigned char *word);

/* Return 1 if a stored word is present in `word`, return 0 if not. */
int tire_word_is_exist(struct trie_node *root, unsigned char *word);

/* Free `root` and every node below it. */
void destroy_trie_tree(struct trie_node *root);

/* Rebuild `*root` from the word list stored in the file `filename`. */
void update_trie_tree(struct trie_node **root, const char *filename);

#endif /* TRIE_TREE_H_INCLUDED */
trie_tree.c:
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <trie_tree.h>struct trie_ node *create_trie_node (int value) {struct Trie_node * node = calloc (1, sizeof (struct trie_node)); node->value = Value;re Turn node;} int tire_word_is_exist (struct trie_node *root, unsigned char *word) {struct Trie_node *n = null;unsigned char *p = null;if (Root = NULL) {return 0;} while (*word! = 0) {p = Word++;n = Root;while (*p! = 0) {n = n->node[*p];if (n = = NULL) {break;} else if (n->exist = = 1) {return 1;} p++;}} return 0;} void Trie_tree_insert_word (struct trie_node *root, unsigned char *word) {struct Trie_node *n;while (*word! = 0) {n = root- >node[*word];if (n = = NULL) {n = create_trie_node (*word); Root->node[*word] = n;} root = n;word++;} Root->exist = 1;} void Destroy_trie_tree (struct trie_node *root) {int i;if (root = NULL) {return;} for (i = 0; i < Word_num; i++) {Destroy_trie_tree (root->node[i]);} Free (root);} void Update_trie_tree (struct trie_node **roOT, const char *filename) {char word[1024]; FILE *fp;char *p;if (*root! = NULL) {destroy_trie_tree (*root);} *root = calloc (sizeof (**root), 1), fp = fopen (filename, "R"), if (fp = = NULL) {printf ("file can ' t open%s\n", filename); retur n;} while (fgets (Word, sizeof (word), FP)) {p = word;while (*p! = 0) {if (*p = = ' \ r ' | | *p = ' \ n ' | | *p = = ') {*p = 0;break; }p++;} Trie_tree_insert_word (*root, (unsigned char *) word);}}
Copyright notice: This is an original article by the blog author and may not be reproduced without consent.
DFA and trie for sensitive word filtering (Python and C language)