sphinx全文检索安装与应用|技术客 - 技术杂谈|IT经验分享

Sphinx是一个基于SQL的全文检索引擎，可以结合MySQL、PostgreSQL做全文搜索，

它可以提供比数据库本身更专业的搜索功能，使得应用程序更容易实现专业化的全文检索。

一、软件环境

CentOS 7.2

Sphinx 3.1.1

二、安装

# Download and unpack the Sphinx 3.1.1 binary release.
wget http://sphinxsearch.com/files/sphinx-3.1.1-612d99f-linux-amd64.tar.gz
tar -zxvf sphinx-3.1.1-612d99f-linux-amd64.tar.gz

# Create the directories that will hold the index files and the logs.
cd sphinx-3.1.1
mkdir data log

# Create the configuration file next to the binaries and open it for editing.
cd bin
touch sphinx.conf
vi sphinx.conf

# Data source "doc": fetches the rows to be indexed from the MySQL database sphinx_test.
source doc
{
    type            = mysql
    sql_host        = localhost
    sql_user        = sphinx_test
    # NOTE(review): password is stored in plain text; presumably a demo credential - confirm before reuse.
    sql_pass        = woaini8619
    sql_db          = sphinx_test
    sql_port        = 3306
    # Runs before the main query so the connection uses the utf8 character set.
    sql_query_pre   = SET NAMES utf8
    # Main fetch query: id identifies the document, title and content are the text to index.
    sql_query       = SELECT id, title, content FROM documents
    #sql_attr_uint       = group_id
    #sql_attr_timestamp  = date_added
}


# Index "testindex": full-text index built from source "doc"; its files are stored under data/.
index testindex
{
    source          = doc
    path            = /home/sphinx-3.1.1/data/testindex
    mlock         = 0
    min_word_len  = 2
    min_prefix_len = 0
    min_infix_len = 1
    # Characters listed in ngram_chars are indexed as 1-character n-grams.
    # The listed code points cover CJK ideographs, Bopomofo, kana, Hangul and Yi.
    ngram_len     = 1
    ngram_chars = U+4E00..U+9FBB, U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, U+FA28, U+FA29, U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, U+3085, U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, U+30BD, U+30BF, U+30C1, U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, U+30E1, U+30E2, U+30E3, U+30E5, U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, U+31F8, U+31F9, U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, U+A000..U+A48C, U+A492..U+A4C6
}

# Settings for the indexer tool.
indexer
{
    # Memory limit the indexer may use while building an index.
    mem_limit       = 128M
}


# Settings for the searchd daemon; all log and pid files live under log/.
searchd
{
    # Port used by the native API client (the PHP example connects to 9312).
    listen          = 9312
    #listen                   = 9306:mysql41 #Used for SphinxQL
    log         = /home/sphinx-3.1.1/log/sphinx.log
    query_log       = /home/sphinx-3.1.1/log/query.log
    read_timeout        = 5
    max_children        = 30
    pid_file        = /home/sphinx-3.1.1/log/searchd.pid
    seamless_rotate     = 1
    preopen_indexes     = 1
    unlink_old      = 1
    workers         = threads # for RT to work
    binlog_path     = /home/sphinx-3.1.1/data
}

创建索引

# Assumes the current directory is the sphinx bin directory
./indexer --all

启动

# Start the search daemon (run from the sphinx bin directory, like the indexer above)
./searchd

三、PHP调用

将sphinx/api目录下的sphinxapi.php文件拷贝到站点目录中

# sphinxapi.php ships in the api/ directory of the Sphinx release, not under bin/.
# The absolute path matches the /home/sphinx-3.1.1 install location used in sphinx.conf.
cp /home/sphinx-3.1.1/api/sphinxapi.php /wwwroot/default

# Create index.php inside the site directory, next to sphinxapi.php,
# so that its relative require of sphinxapi.php resolves.
cd /wwwroot/default
touch index.php
vi index.php

<?php
require 'sphinxapi.php';

// Connect to the searchd daemon on its native API port (see "listen" in sphinx.conf).
$sc = new SphinxClient();
$sc->setServer('localhost', 9312);

$keyword   = '爱你';
$indexname = 'testindex';

// query() returns false when the search fails (daemon down, unknown index, ...);
// report the reason instead of dumping a bare false.
$rs = $sc->query($keyword, $indexname);
if ($rs === false) {
    exit('Sphinx query failed: ' . $sc->getLastError());
}
var_dump($rs);

打印结果如图

QQ图片20181026230545.png

结果在matches下面，它的key就是对应数据库（我们这里是mysql）中的id，接着写示例代码

// The keys of $rs['matches'] are the ids of the matching rows in MySQL.
// 'matches' is absent when the search failed or found nothing, so fall back to an empty list.
$ids = (is_array($rs) && !empty($rs['matches'])) ? $rs['matches'] : [];
// Force every id to an integer before it is placed into the SQL text.
$id  = implode(',', array_map('intval', array_keys($ids)));

$list = [];
// Skip the database round trip when there is nothing to fetch; "in()" would be a syntax error.
if ($id !== '') {
    $conn = mysqli_connect('localhost', 'sphinx_test', 'woaini8619', 'sphinx_test');
    if (!$conn) {
        exit('MySQL connection failed: ' . mysqli_connect_error());
    }
    // mysqli_set_charset also informs the client library, unlike a raw "set names" query.
    mysqli_set_charset($conn, 'utf8');

    $res = mysqli_query($conn, "select * from documents where id in($id)");
    if ($res === false) {
        exit('MySQL query failed: ' . mysqli_error($conn));
    }
    while ($row = mysqli_fetch_row($res)) {
        $list[] = $row;
    }
    mysqli_free_result($res);
    mysqli_close($conn);
}

var_dump($list);

QQ图片20181026230854.png

这就是所需要的数据了，我们还可以对搜索关键词进行处理，比如描红

// Markup wrapped around every occurrence of the keyword in the excerpts.
$excerptOptions = [
    'before_match' => '<font color=red>',
    'after_match'  => '</font>'
];

// Build and print the highlighted excerpts for each fetched row.
foreach ($list as $v) {
    $row = $sc->buildExcerpts($v, $indexname, $keyword, $excerptOptions);

    var_dump($row);
}

QQ图片20181026231039.png

四、增量索引

如果这个时候数据库中新增或更新了数据，那么新的数据是检索不到的，怎么办呢？

不可能每次都重新生成索引，如果数据库数据量很大，这样效率是非常低的，

所以我们就需要添加一个增量数据源和增量索引，对新增加的数据单独建立索引，完成后

将其和原来的索引（主索引）合并即可。

我们需要新建一张表，保存主索引中最大ID的值，新增的数据肯定会大于这个保存的ID，

那么我们就知道哪些数据是需要新建索引的

-- Helper table with a single row: the highest document id already covered by the main index.
CREATE TABLE counter (id INT NOT NULL);
INSERT INTO counter (id) VALUES (0);

vi sphinx.conf

# Main data source: fetches every document for the main index.
source doc
{
    type            = mysql
    sql_host        = localhost
    sql_user        = sphinx_test
    sql_pass        = woaini8619
    sql_db          = sphinx_test
    sql_port        = 3306
    sql_query_pre   = SET NAMES utf8
    sql_query       = SELECT id, title, content FROM documents
    #sql_attr_uint       = group_id
    #sql_attr_timestamp  = date_added

    # Record the highest document id that was actually indexed.
    # This uses the post-index query (not sql_query_post) so the counter only moves
    # after indexing has succeeded. $maxid is the largest id that was really fetched,
    # so rows inserted while the indexer was running are not skipped.
    # $maxid expands to 0 when nothing was fetched; the WHERE clause keeps the
    # counter from moving backwards in that case.
    sql_query_post_index = UPDATE counter SET id = $maxid WHERE id < $maxid
}

# Delta data source: inherits the connection settings, sql_query_pre and the
# post-index counter update from "doc"; only the fetch query differs.
# Because the counter advances after each delta build, always merge the delta
# into the main index right after building it.
source doc_zl : doc
{
    sql_query       = SELECT id, title, content FROM documents WHERE id > (SELECT id FROM counter)
}

# Main index: built from source "doc"; also the parent the delta index inherits from.
index testindex
{
    source          = doc
    path            = /home/sphinx-3.1.1/data/testindex
    mlock         = 0
    min_word_len  = 2
    min_prefix_len = 0
    min_infix_len = 1
    # Characters listed in ngram_chars are indexed as 1-character n-grams.
    ngram_len     = 1
    ngram_chars = U+4E00..U+9FBB, U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, U+FA28, U+FA29, U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, U+3085, U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, U+30BD, U+30BF, U+30C1, U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, U+30E1, U+30E2, U+30E3, U+30E5, U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, U+31F8, U+31F9, U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, U+A000..U+A48C, U+A492..U+A4C6
}

# Delta index: inherits every setting of testindex (word lengths, n-gram settings, mlock).
# Only the data source and the storage path are overridden.
index testindexzl : testindex
{
    source          = doc_zl
    path            = /home/sphinx-3.1.1/data/testindex_zl
}

强制重新生成增量索引

# Rebuild the delta index; --rotate is used because searchd is already running
./indexer  testindexzl --rotate

合并主索引和增量索引

# Merge the delta index (second argument) into the main index (first argument)
./indexer --merge testindex testindexzl --rotate

当然在实际环境中需要将这两个命令加入到cron中定时运行。

五、分布式

利用程序检索的时候，如果新增的数据还没有和主索引合并，那么检索主索引的时候是检索不到的，

(在程序中我们只能指定一个索引名，如果是在程序端将主索引搜索出来的数据和增量索引搜出来的数据进行合并,会加大工作量，很不方便)

这个时候我们怎么处理呢？我们可以利用分布式来处理。

我们在配置文件中加入一个新的分布式索引

vi sphinx.conf
# Distributed index: a single name that searches the main and the delta index together.
index testindexmerge
{
    type = distributed
    local = testindex # local index on this host
    local = testindexzl # local index on this host
    #agent                   = 192.168.10.103:9313:myuser  // remote
    #agent_connect_timeout   = 1000
    #agent_query_timeout     = 3000
}

在程序中检索的时候指定的索引改为这个分布式的索引即可

// Search through the distributed index so both the main and the delta index are queried.
$indexname = 'testindexmerge';

这里有个问题：描红的时候如果用这个分布式索引名，buildExcerpts会返回false（生成摘要需要用到本地索引的分词设置，而分布式索引本身没有这些设置）

// With $indexname set to the distributed index 'testindexmerge' this call returns false.
$row = $sc->buildExcerpts($v, $indexname, $keyword, [
        'before_match' => '<font color=red>',
        'after_match'  => '</font>'
    ]);

我们可以这样处理，描红的时候重新指定下索引为主索引名

// Name the main index explicitly when building excerpts.
$row = $sc->buildExcerpts($v, 'testindex', $keyword, [
        'before_match' => '<font color=red>',
        'after_match'  => '</font>'
    ]);

六、多线程

# NOTE(review): presumably dist_threads is meant to be added to the searchd section
# shown earlier rather than declared as a second searchd section - confirm.
# NOTE(review): presumably it lets searchd query the local parts of a distributed
# index in parallel - confirm against the searchd documentation.
searchd
{
    dist_threads = 2
}

sphinx全文检索安装与应用

访客评论

最新评论