The application of Sphinx Chinese word segmentation in discuz.
Sphinx-for-chinese is a full-text search engine focused on Chinese search: it adds Chinese word-segmentation modules to Sphinx and optimizes Chinese search results. In a forum environment with tens of millions of records, it performs better than the Coreseek package (Sphinx combined with the mmseg3 Chinese word segmenter).
1. Preparing the Environment
cd /var/tmp/
wget http://www.sphinx-search.com/downloads/sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
wget http://docs.zjyxh.com/discuzx/sphinx-for-chinese/xdict
mkdir -p /data0/dzbbs/indexdata/
mkdir -p /data1/dzbbs/indexdata/
2. Install Sphinx for Chinese
CXX=gcc
CFLAGS="-O3 -fomit-frame-pointer -pipe -march=nocona -mfpmath=sse -m128bit-long-double -mmmx -msse -msse2 -maccumulate-outgoing-args -m64 -ftree-loop-linear -fprefetch-loop-arrays -freg-struct-return -fgcse-sm -fgcse-las -frename-registers -fforce-addr -fivopts -ftree-vectorize -ftracer -frename-registers -minline-all-stringops -fbranch-target-load-optimize2"
CXXFLAGS="${CFLAGS}"
export CXX CXXFLAGS CFLAGS
tar zxf sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
cd sphinx-for-chinese-2.2.1-dev-r4311
./configure --prefix=/usr/local/sfc22/ --with-mysql=/usr/local/mysql/ --enable-id64
make -j8 install
3. Configure Sphinx for Chinese
cd /usr/local/sfc22/etc/
mv xdict xdictbak
cp -a /var/tmp/xdict .
vim sphinx.conf
source pre_forum_thread
{
    type = mysql
    sql_host = localhost
    sql_user = sphinx
    sql_pass = sphinx2013
    sql_db = discuzx
    sql_port = 3306
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF
    sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 1, MAX(tid) FROM pre_forum_thread
    sql_query = SELECT t.tid AS id, t.tid, t.subject, t.digest, t.displayorder, t.authorid, t.lastpost, t.special \
        FROM pre_forum_thread AS t \
        WHERE t.tid >= $start AND t.tid <= $end
    sql_query_range = SELECT (SELECT MIN(tid) FROM pre_forum_thread), maxid FROM pre_common_sphinxcounter WHERE indexid=1
    sql_range_step = 5000
    sql_attr_uint = tid
    sql_attr_uint = digest
    sql_attr_uint = displayorder
    sql_attr_uint = authorid
    sql_attr_uint = special
    sql_attr_timestamp = lastpost
    sql_query_info = SELECT * FROM pre_forum_thread WHERE tid=$id
}
#threads
index pre_forum_thread
{
    source = pre_forum_thread
    path = /data0/dzbbs/indexdata/pre_forum_thread # on Windows it is best to use the full path
    docinfo = extern
    mlock = 0
    morphology = none
    #charset_dictpath = /usr/local/mmseg32/etc/ # setting for BSD/Linux; must end with "/"
    #charset_dictpath = etc/ # setting for Windows; must end with "/"
    #charset_debug = 0
    ### minimum length of an indexed word
    min_word_len = 2
    #charset_type = zh_cn.utf-8
    html_strip = 1
    ##### Character table. Note: with this approach alone, Sphinx indexes Chinese
    ##### character by character; to get Chinese word segmentation you must use a segmenter such as Coreseek or SFC
    charset_type = utf-8
    chinese_dictionary = /usr/local/sfc22/etc/xdict
    min_prefix_len = 0
    min_infix_len = 1
    ngram_len = 0
}
#threads_minute
source pre_forum_thread_minute : pre_forum_thread
{
    #sql_query_pre =
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF
    sql_query_range = SELECT maxid-1, (SELECT MAX(tid) FROM pre_forum_thread) FROM pre_common_sphinxcounter WHERE indexid=1
}
#threads_minute
index pre_forum_thread_minute : pre_forum_thread
{
    source = pre_forum_thread_minute
    path = /data0/dzbbs/indexdata/pre_forum_thread_minute # on Windows it is best to use the full path
}
#posts
source pre_forum_post : pre_forum_thread
{
    type = mysql
    sql_query_pre =
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF
    sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 2, MAX(pid) FROM pre_forum_post
    sql_query = SELECT p.pid AS id, p.tid, p.subject, p.message, t.digest, t.displayorder, t.authorid, t.lastpost, t.special \
        FROM pre_forum_post AS p LEFT JOIN pre_forum_thread AS t USING (tid) WHERE p.pid >= $start AND p.pid <= $end \
        AND p.first=1
    sql_query_range = SELECT (SELECT MIN(pid) FROM pre_forum_post), maxid FROM pre_common_sphinxcounter WHERE indexid=2
    sql_range_step = 5000
    sql_attr_uint = tid
    sql_attr_uint = digest
    sql_attr_uint = displayorder
    sql_attr_uint = authorid
    sql_attr_uint = special
    sql_attr_timestamp = lastpost
    sql_query_info = SELECT * FROM pre_forum_post WHERE pid=$id
}
#posts
index pre_forum_post
{
    source = pre_forum_post
    path = /data1/dzbbs/indexdata/pre_forum_post # on Windows it is best to use the full path
    docinfo = extern
    mlock = 0
    morphology = none
    #charset_dictpath = /usr/local/mmseg32/etc/ # setting for BSD/Linux; must end with "/"
    #charset_dictpath = etc/ # setting for Windows; must end with "/"
    #charset_debug = 0
    ### minimum length of an indexed word
    min_word_len = 2
    #charset_type = zh_cn.utf-8
    html_strip = 0
    ##### Character table. Note: with this approach alone, Sphinx indexes Chinese
    ##### character by character; to get Chinese word segmentation you must use a segmenter such as Coreseek or SFC
    charset_type = utf-8
    chinese_dictionary = /usr/local/sfc22/etc/xdict
    min_prefix_len = 0
    min_infix_len = 1
    ngram_len = 0
}
#pre_forum_post_minute
source pre_forum_post_minute : pre_forum_post
{
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SET SESSION query_cache_type=OFF
    sql_query_range = SELECT maxid-1, (SELECT MAX(pid) FROM pre_forum_post) FROM pre_common_sphinxcounter WHERE indexid=2
}
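#pre_forum_post_minute
# NOTE(review): by analogy with the pre_forum_thread_minute index, source and path here
# were probably meant to be pre_forum_post_minute; as written, this index uses the full
# pre_forum_post source and the pre_forum_post_minute source is never referenced. Confirm before use.
index pre_forum_post_minute : pre_forum_post
{
    source = pre_forum_post
    path = /data0/dzbbs/indexdata/pre_forum_post # on Windows it is best to use the full path
}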
# global indexer settings
indexer
{
    mem_limit = 2047M
    write_buffer = 64M
}
# searchd service settings
searchd
{
    listen = 3312
    read_timeout = 5
    max_children = 30
    max_matches = 500
    listen_backlog = 20
    seamless_rotate = 0
    preopen_indexes = 0
    max_filter_values = 16384
    max_filters = 1024
    mva_updates_pool = 16M
    max_packet_size = 32M
    read_buffer = 1M
    unlink_old = 1
    pid_file = /usr/local/sfc22/var/log/searchd_discuzx.pid # on Windows it is best to use the full path
    log = /usr/local/sfc22/var/log/searchd_discuzx.log # on Windows it is best to use the full path
    query_log = /usr/local/sfc22/var/log/query_discuzx.log # on Windows it is best to use the full path
}
4. Test the SFC Chinese word segmentation
/usr/local/sfc22/bin/search -c /usr/local/sfc22/etc/sphinx.conf share the wonderful
Words
1. ' Share ': 194 documents, 266 hits
2. ' Wonderful ': 368 documents, 425 hits
3. ' Wonderful video ': 2 documents, 2 hits
4. ' Wonderful show ': 2 documents, 2 hits
5. ' Fabulous ': 2 documents, 2 hits
6. ' Highlights ': 7 documents, 7 hits
7. ' Wonderful moments ': 5 documents, 5 hits
8. ' Wonderful review ': 2 documents, 3 hits
As the output shows, SFC's Chinese word segmentation is fairly accurate.
5 Starting SFC
/usr/local/sfc22/bin/searchd -c /usr/local/sfc22/etc/sphinx.conf
6. Writing the main and incremental index scripts
1) Write a script that rebuilds the incremental indexes of the thread (subject) table and the post table and merges the thread incremental index into the thread main index; run it every 20 minutes.
vim /usr/local/bin/bbssearch.sh
#!/bin/bash
# desc: this script builds the sphinx-for-chinese indexes
# date: 2014.12.23
# tested on CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch.sh
# written by [email protected] coralzd.blog.51cto.com blog.zjyxh.com
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH
y=$(date +%Y)
m=$(date +%m)
d=$(date +%d)
# create thread merge index
echo "-- thread main index start `date +${y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_thread_minute --rotate
echo "-- thread merge index end `date +${y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
# create post merge index
echo "-- post main index start `date +${y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_post_minute --rotate
echo "-- post merge index end `date +${y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
# main index + merge index thread
echo "-- thread main merge + index start `date +${y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
indexer --merge pre_forum_thread pre_forum_thread_minute --merge-dst-range deleted 0 0 --rotate
echo "-- thread main merge + index end `date +${y}-${m}-${d}.%H:%M:%S` --" >> /var/log/sphinx.log
2) Write a script that rebuilds the main index of the post table; run it once a week.
vim /usr/local/bin/bbssearch2.sh
#!/bin/bash
# desc: this script builds the sphinx-for-chinese post main index
# date: 2014.12.23
# tested on CentOS 6.5 x86_64
# saved in /usr/local/bin/bbssearch2.sh
# written by [email protected] coralzd.blog.51cto.com blog.zjyxh.com
export PATH=/usr/local/sfc22/bin/:/sbin/:$PATH
y=$(date +%Y)
m=$(date +%m)
d=$(date +%d)
# create post merge index
echo "-- post main index start `date +${y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
indexer pre_forum_post --rotate
echo "-- post merge index end `date +${y}-$m-$d.%H:%M:%S` --" >> /var/log/sphinx.log
Add both scripts to crontab as scheduled tasks.
This article is from the "Tri Xiaohui blog" blog; please keep this source attribution: http://coralzd.blog.51cto.com/90341/1593907
The application of Sphinx Chinese word segmentation in Discuz