Syntax highlighting with Lucene.Net 2.0 after Chinese word segmentation

The Lucene.Net 2.0 source distribution includes Highlighter.Net-2.0.0, which can be used to highlight matched keywords in search results.
// Build a multi-condition (boolean) search query.
BooleanQuery bQuery = new BooleanQuery();
// Define the analyzer (tokenizer).
Analyzer analyzer = new StandardAnalyzer();
// Parser that searches several fields at once.
MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "title", "content" }, analyzer);
Query query = parser.Parse(key);
bQuery.Add(query, BooleanClause.Occur.MUST);
DateTime now = DateTime.Now;
// Initialize the searcher.
// Distributed search: open one IndexReader per sub-index directory and merge them.
List<IndexReader> indexReaders = new List<IndexReader>();
string[] dirs = Directory.GetDirectories(dir);
if (searchType == SearchType.None)
{
    // No type filter: search every sub-index under dir.
    foreach (string item in dirs)
    {
        indexReaders.Add(IndexReader.Open(Path.Combine(Path.Combine(dir, item), "save")));
    }
}
else
{
    // Search only the index for the requested type.
    indexReaders.Add(IndexReader.Open(Path.Combine(Path.Combine(dir, searchType.ToString()), "save")));
}
MultiReader reader = new MultiReader(indexReaders.ToArray());
// NOTE(review): indexSearcher, timer and highlighter appear to be fields declared
// elsewhere in the class — confirm against the full source.
indexSearcher = new IndexSearcher(reader);
Hits hits = null;
hits = indexSearcher.Search(bQuery);
// Elapsed search time in milliseconds.
timer = (DateTime.Now - now).TotalMilliseconds;
int count = hits.Length();
/* Calculate the range of entries to display (10 per page). */
int start = (pageNo - 1) * 10;
int end = pageNo * 10 > count ? count : pageNo * 10;
/* Highlighting setup. */
highlighter = new Highlighter(new QueryScorer(query));
highlighter.SetTextFragmenter(new SimpleFragmenter(100));
for (int i = start; i < end; i++)
{
    Lucene.Net.Documents.Document doc = hits.Doc(i);
    string text = doc.Get("content");
    // Append a 10-character sentinel so special characters at the very end
    // of the title are not filtered out by the highlighter.
    string title = doc.Get("title") + "+aaaaaaaaa";
    Lucene.Net.Analysis.TokenStream tokenStream = analyzer.TokenStream("content", new System.IO.StringReader(text));
    Lucene.Net.Analysis.TokenStream titleStream = analyzer.TokenStream("title", new System.IO.StringReader(title));
    string result = highlighter.GetBestFragments(tokenStream, text, 2, "...");
    string tResult = highlighter.GetBestFragments(titleStream, title, 0, "..");
    // Strip the 10-character sentinel off the highlighted title.
    if (tResult.Length > 10)
        tResult = tResult.Remove(tResult.Length - 10, 10);
    // No highlight hit in the title: fall back to the raw title (sentinel removed).
    if (string.IsNullOrEmpty(tResult))
        tResult = title.Remove(title.Length - 10, 10);
    // No highlight hit in the content: show a plain leading snippet instead.
    if (string.IsNullOrEmpty(result))
    {
        if (text.Length > 100)
            result = text.Substring(0, 100);
        else
            result = text;
    }
    // Indicate that the snippet is truncated.
    if (result.Length < text.Length)
        result = result + "...";
}
The code above uses StandardAnalyzer, the analyzer built into Lucene.Net; its drawback is that it treats every single Chinese character as a word. Now we want to use MyAnalyzer, our own Chinese word-segmentation analyzer — but then Highlighter always throws an error. Why? Because a Chinese segmenter typically appends a separator to each token. For example, after segmentation the phrase "Hujiang English" becomes "Hujiang, English," — that is, the returned tokens are (Hujiang,) and (English,), each carrying a trailing separator. Highlighter expects token offsets that match the original text exactly, so these extra separator characters shift the offsets and cause Highlighter to fail. Modifying Highlighter is troublesome, and special-casing the analyzer is hard. One workaround is to re-split the segmented result and highlight that — e.g. take the segmentation output "Hujiang English" and feed it to Highlighter — but then the search results contain many spurious spaces. For the lazy there is a simpler solution: use MyAnalyzer for indexing and searching, and StandardAnalyzer only for highlighting:
// Build a multi-condition (boolean) search query.
BooleanQuery bQuery = new BooleanQuery();
// Define the analyzers: MyAnalyzer (Chinese segmentation) for searching,
// StandardAnalyzer only for building the highlight query.
Analyzer analyzer = new MyAnalyzer();
Analyzer highAnalyzer = new StandardAnalyzer();
// BUG FIX: the original declared `parser` twice (compile error CS0128);
// the highlight-query parser gets its own name here.
MultiFieldQueryParser highParser = new MultiFieldQueryParser(new string[] { "title", "content" }, highAnalyzer);
Query highQuery = highParser.Parse(key);
// Parser used for the actual search.
MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "title", "content" }, analyzer);
Query query = parser.Parse(key);
bQuery.Add(query, BooleanClause.Occur.MUST);
DateTime now = DateTime.Now;
// Initialize the searcher.
// Distributed search: open one IndexReader per sub-index directory and merge them.
List<IndexReader> indexReaders = new List<IndexReader>();
string[] dirs = Directory.GetDirectories(dir);
if (searchType == SearchType.None)
{
    // No type filter: search every sub-index under dir.
    foreach (string item in dirs)
    {
        indexReaders.Add(IndexReader.Open(Path.Combine(Path.Combine(dir, item), "save")));
    }
}
else
{
    // Search only the index for the requested type.
    indexReaders.Add(IndexReader.Open(Path.Combine(Path.Combine(dir, searchType.ToString()), "save")));
}
MultiReader reader = new MultiReader(indexReaders.ToArray());
// NOTE(review): indexSearcher, hits, timer and highlighter appear to be fields
// declared elsewhere in the class — confirm against the full source.
indexSearcher = new IndexSearcher(reader);
hits = null;
hits = indexSearcher.Search(bQuery);
// Elapsed search time in milliseconds.
timer = (DateTime.Now - now).TotalMilliseconds;
int count = hits.Length();
/* Calculate the range of entries to display (10 per page). */
int start = (pageNo - 1) * 10;
int end = pageNo * 10 > count ? count : pageNo * 10;
/* Highlighting setup: score with the StandardAnalyzer-built query so token
   offsets line up with the StandardAnalyzer token streams below. */
highlighter = new Highlighter(new QueryScorer(highQuery));
highlighter.SetTextFragmenter(new SimpleFragmenter(100));
for (int i = start; i < end; i++)
{
    Lucene.Net.Documents.Document doc = hits.Doc(i);
    string text = doc.Get("content");
    // Append a 10-character sentinel so special characters at the very end
    // of the title are not filtered out by the highlighter.
    string title = doc.Get("title") + "+aaaaaaaaa";
    // Tokenize with the highlight analyzer, NOT the search analyzer.
    Lucene.Net.Analysis.TokenStream tokenStream = highAnalyzer.TokenStream("content", new System.IO.StringReader(text));
    Lucene.Net.Analysis.TokenStream titleStream = highAnalyzer.TokenStream("title", new System.IO.StringReader(title));
    string result = highlighter.GetBestFragments(tokenStream, text, 2, "...");
    string tResult = highlighter.GetBestFragments(titleStream, title, 0, "..");
    // Strip the 10-character sentinel off the highlighted title.
    if (tResult.Length > 10)
        tResult = tResult.Remove(tResult.Length - 10, 10);
    // No highlight hit in the title: fall back to the raw title (sentinel removed).
    if (string.IsNullOrEmpty(tResult))
        tResult = title.Remove(title.Length - 10, 10);
    // No highlight hit in the content: show a plain leading snippet instead.
    if (string.IsNullOrEmpty(result))
    {
        if (text.Length > 100)
            result = text.Substring(0, 100);
        else
            result = text;
    }
    // Indicate that the snippet is truncated.
    if (result.Length < text.Length)
        result = result + "...";
}
The result is very friendly, for example:
Http://so.yeshj.com/so.aspx? Key = % E6 % B2 % Aa % E6 % B1 % 9f % E6 % 97% a5 % E8 % af % ad & H = % E6 % B2 % Aa % E6 % B1 % 9f % E6 % 97% a5 % E8 % af % ad
(End of article.)