1. DFA algorithm
The principle of the DFA algorithm can be found in the reference linked here. In short, it builds a tree of sensitive words out of nested maps: each path from the root node to a node marked as a word end spells out one sensitive word. For example:
A simple implementation follows:
/**
 * Sensitive-word filter based on the DFA (trie) algorithm.
 *
 * <p>The word list is loaded from a classpath file (one word per line) into a
 * trie of nested maps: each {@code Character} key is an edge to a child map,
 * and the {@code String} key {@code "isEnd"} with value {@code "true"} marks
 * that the path from the root to that node spells a complete sensitive word.
 *
 * <p>Not thread-safe: initialization mutates static state without locking.
 */
public class TextFilterUtil {

    // Logger for reporting load/initialization problems.
    private static final Logger log = Logger.getLogger(TextFilterUtil.class.getName());

    // Charset of the sensitive-word file.
    private static final String ENCODING = "GBK";

    // Classpath location of the sensitive-word file (one word per line).
    private static final String KEYWORDS_PATH = "sensitive/KeyWords.txt";

    // Trie root. Keys are Character (edges) plus the String key "isEnd".
    // Lazily built on first use; null until then.
    private static Map<Object, Object> sensitiveWordMap = null;

    /** Initializes the trie from the sensitive-word file on the classpath. */
    private static void init() {
        initFromWords(readSensitiveWords());
    }

    /**
     * Builds the trie from an explicit word set. Public so callers (and tests)
     * can supply words directly instead of relying on the classpath file.
     *
     * @param keyWords sensitive words; blank entries are ignored
     */
    public static void initFromWords(Set<String> keyWords) {
        sensitiveWordMap = new HashMap<>(Math.max(16, keyWords.size()));
        for (String keyWord : keyWords) {
            createKeyword(keyWord);
        }
    }

    /**
     * Inserts one word into the trie, sharing existing prefix nodes.
     *
     * @param keyWord word to insert; null/empty is ignored (an empty word
     *                would otherwise mark the root itself as a match)
     */
    private static void createKeyword(String keyWord) {
        if (sensitiveWordMap == null) {
            log.severe("sensitiveWordMap not initialized!");
            return;
        }
        if (keyWord == null || keyWord.isEmpty()) {
            return;
        }
        Map<Object, Object> nowMap = sensitiveWordMap;
        for (char c : keyWord.toCharArray()) {
            @SuppressWarnings("unchecked")
            Map<Object, Object> child = (Map<Object, Object>) nowMap.get(c);
            if (child == null) {
                child = new HashMap<>();
                child.put("isEnd", "false");
                nowMap.put(c, child);
            }
            nowMap = child;
        }
        // The path root..nowMap now spells a complete sensitive word.
        nowMap.put("isEnd", "true");
    }

    /**
     * Reads the sensitive-word file from the classpath.
     *
     * <p>The stream is opened per call and closed via try-with-resources:
     * the original cached it in a static field, which leaked the stream and
     * made it unusable after the first read.
     *
     * @return one trimmed word per non-blank line; an empty set when the
     *         file is missing or unreadable (logged, never thrown)
     */
    private static Set<String> readSensitiveWords() {
        Set<String> keyWords = new HashSet<>();
        InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream(KEYWORDS_PATH);
        if (in == null) {
            log.severe("The sensitive thesaurus file does not exist!");
            return keyWords;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    keyWords.add(word);
                }
            }
        } catch (UnsupportedEncodingException e) {
            log.severe("Sensitive thesaurus file transcoding failed!");
        } catch (IOException e) {
            log.severe("Sensitive thesaurus file read failed!");
        }
        return keyWords;
    }

    /**
     * Scans {@code text} and returns every sensitive word found, in order of
     * occurrence; overlapping matches are all reported (e.g. both "ba" and
     * "bad").
     *
     * <p>Fix over the original: the end-of-word flag was checked before the
     * character at {@code j} was consumed, so matches ending at the last
     * character of the text — and single-character words — were never
     * reported. The walk below checks the flag after descending.
     *
     * @param text text to scan; null or empty yields an empty list
     * @return matched words (may contain duplicates for repeated hits)
     */
    public static List<String> checkSensitiveWord(String text) {
        if (sensitiveWordMap == null) {
            init(); // lazy load from the classpath file
        }
        List<String> sensitiveWords = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return sensitiveWords;
        }
        for (int i = 0; i < text.length(); i++) {
            Map<Object, Object> node = sensitiveWordMap;
            // Walk the trie as long as text[i..j] is a prefix of some word.
            for (int j = i; j < text.length(); j++) {
                @SuppressWarnings("unchecked")
                Map<Object, Object> next = (Map<Object, Object>) node.get(text.charAt(j));
                if (next == null) {
                    break;
                }
                if ("true".equals(next.get("isEnd"))) {
                    sensitiveWords.add(text.substring(i, j + 1));
                }
                node = next;
            }
        }
        return sensitiveWords;
    }
}
2. TTMP algorithm
The TTMP algorithm was originally proposed by a community member (see the linked write-up for its origin). Its idea is to break the sensitive words down into a set of "dirty characters"; a substring is compared against the word set only when it consists entirely of dirty characters, which reduces the number of comparisons. A simple implementation of this algorithm follows:
/**
 * Sensitive-word filter based on the TTMP idea: keep the set of every
 * character that occurs in any sensitive word ("dirty characters"), and
 * only compare a substring against the word set while it consists entirely
 * of dirty characters.
 *
 * <p>The word list is loaded from a classpath file (one word per line).
 *
 * <p>Not thread-safe: initialization mutates static state without locking.
 */
public class TextFilterUtil {

    // Logger for reporting load/initialization problems.
    private static final Logger log = Logger.getLogger(TextFilterUtil.class.getName());

    // Charset of the sensitive-word file.
    private static final String ENCODING = "GBK";

    // Classpath location of the sensitive-word file (one word per line).
    private static final String KEYWORDS_PATH = "sensitive/KeyWords.txt";

    // Every character that occurs in any sensitive word ("dirty characters").
    private static Set<Character> sensitiveCharSet = null;

    // The sensitive words themselves. Lazily built on first use; null until then.
    private static Set<String> sensitiveWordSet = null;

    /** Initializes both sets from the sensitive-word file on the classpath. */
    private static void init() {
        initFromWords(readSensitiveWords());
    }

    /**
     * Builds the word set and the dirty-character set from an explicit word
     * collection. Public so callers (and tests) can supply words directly
     * instead of relying on the classpath file.
     *
     * @param keyWords sensitive words; blank entries are ignored
     */
    public static void initFromWords(Set<String> keyWords) {
        Set<Character> chars = new HashSet<>();
        Set<String> words = new HashSet<>();
        for (String word : keyWords) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            words.add(word);
            for (char c : word.toCharArray()) {
                chars.add(c);
            }
        }
        sensitiveCharSet = chars;
        sensitiveWordSet = words;
    }

    /**
     * Reads the sensitive-word file from the classpath.
     *
     * <p>The stream is opened per call and closed via try-with-resources:
     * the original cached it in a static field, which leaked the stream and
     * made it unusable after the first read.
     *
     * @return one trimmed word per non-blank line; an empty set when the
     *         file is missing or unreadable (logged, never thrown)
     */
    private static Set<String> readSensitiveWords() {
        Set<String> keyWords = new HashSet<>();
        InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream(KEYWORDS_PATH);
        if (in == null) {
            log.severe("The sensitive thesaurus file does not exist!");
            return keyWords;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    keyWords.add(word);
                }
            }
        } catch (UnsupportedEncodingException e) {
            log.severe("Sensitive thesaurus file transcoding failed!");
        } catch (IOException e) {
            log.severe("Sensitive thesaurus file read failed!");
        }
        return keyWords;
    }

    /**
     * Scans {@code text} and returns every sensitive word found, in order of
     * occurrence; overlapping matches are all reported (e.g. both "ba" and
     * "bad").
     *
     * <p>Fix over the original: the inner loop tested the character at
     * {@code i} instead of the character at {@code j}, so it never stopped
     * extending the candidate and scanned to the end of the text for every
     * starting position. The loop below breaks on the first non-dirty
     * character at {@code j}.
     *
     * @param text text to scan; null or empty yields an empty list
     * @return matched words (may contain duplicates for repeated hits)
     */
    public static List<String> checkSensitiveWord(String text) {
        if (sensitiveWordSet == null || sensitiveCharSet == null) {
            init(); // lazy load from the classpath file
        }
        List<String> sensitiveWords = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return sensitiveWords;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!sensitiveCharSet.contains(text.charAt(i))) {
                continue; // text[i] cannot start a sensitive word
            }
            // Extend the candidate only while every character is dirty.
            for (int j = i; j < text.length(); j++) {
                if (!sensitiveCharSet.contains(text.charAt(j))) {
                    break;
                }
                String candidate = text.substring(i, j + 1);
                if (sensitiveWordSet.contains(candidate)) {
                    sensitiveWords.add(candidate);
                }
            }
        }
        return sensitiveWords;
    }
}
Note: the code above is only meant to illustrate the ideas; in real use there are many places to optimize.
[Java Web] filter algorithm for sensitive words