TNTSearch GIT库:https://github.com/vanry/laravel-scout-tntsearch
jieba-php GIT库:https://github.com/fukuball/jieba-php
本文结合结巴分词进行简单介绍
一、安装
composer require vanry/laravel-scout-tntsearch
composer require fukuball/jieba-php
二、范例
<?php
// composer require teamtnt/tntsearch

// Set the PHP memory limit to 1024M before the autoloader is pulled in.
ini_set('memory_limit', '1024M');

require './vendor/autoload.php';

use TeamTNT\TNTSearch\TNTSearch;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\JiebaAnalyse;

// Declare the response as UTF-8 HTML.
header('Content-type:text/html;charset=UTF-8');

$tnt = new TNTSearch();

// Settings handed to the TNTSearch instance.
// NOTE(review): which of these keys TNTSearch itself reads (as opposed to the
// laravel-scout-tntsearch wrapper) is not visible here; confirm against the library.
$config = [
    // Place where the index files will be stored.
    'storage'       => 'E:/project/search/storage',
    'fuzziness'     => true,
    'searchBoolean' => true,
    'asYouType'     => true,
    'fuzzy'         => [
        'prefix_length'  => 2,
        'max_expansions' => 50,
        'distance'       => 2,
    ],
    'tokenizer'     => [
        'driver' => 'default',
        'jieba'  => [
            'dict' => 'small',
        ],
    ],

    // Database connection. NOTE(review): credentials are hard-coded for this demo.
    'driver'        => 'mysql',
    'host'          => '127.0.0.1',
    'port'          => 3306,
    'database'      => 'jblog',
    'username'      => 'root',
    'password'      => 'root',
    //'unix_socket' => env('DB_SOCKET', ''),
    'charset'       => 'utf8mb4',
    'collation'     => 'utf8mb4_unicode_ci',
    //'prefix' => 'lara_',
    'strict'        => true,
    'engine'        => null,

    // Words listed under the 'stopwords' key.
    'stopwords'     => [
        '的',
        '了',
        '而是',
    ],
];
$tnt->loadConfig($config);
为了让 TNTSearch 支持中文,我们还需要修改一个文件:TeamTNT\TNTSearch\Support\Tokenizer
<?php
namespace TeamTNT\TNTSearch\Support;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
class Tokenizer implements TokenizerInterface
{
// public function tokenize($text, $stopwords = [])
// {
// $text = mb_strtolower($text);
// $split = preg_split("/[^\p{L}\p{N}]+/u", $text, -1, PREG_SPLIT_NO_EMPTY);
// return array_diff($split, $stopwords);
// }
/**
 * Initialise the jieba-php classes this tokenizer relies on.
 *
 * @param array $options Passed unchanged to both Jieba::init() and Finalseg::init().
 */
public function __construct(array $options = [])
{
Jieba::init($options);
Finalseg::init($options);
}
/**
 * Split a piece of text into search tokens.
 *
 * @param mixed $text      Text to tokenize.
 * @param array $stopwords Forwarded to getTokens().
 * @return array Empty when $text is numeric, otherwise whatever getTokens() returns.
 */
public function tokenize($text, $stopwords = [])
{
    // Numeric input yields no tokens at all.
    if (is_numeric($text)) {
        return [];
    }

    return $this->getTokens($text, $stopwords);
}
/**
 * Segment text with Jieba's search-mode cutter and drop stop words.
 *
 * The $stopwords argument used to be accepted but ignored, so a configured
 * stop-word list had no effect on the tokens this method returned. Tokens
 * that appear in $stopwords are now removed.
 *
 * @param string $text      Text to segment.
 * @param array  $stopwords Tokens to exclude from the result.
 * @return array Remaining tokens, re-indexed from 0 when any filtering is applied.
 */
public function getTokens($text, $stopwords = [])
{
    $tokens = Jieba::cutForSearch($text);

    // Nothing to filter: hand back the segmenter output untouched.
    if (empty($stopwords)) {
        return $tokens;
    }

    // array_diff() keeps the original keys; re-index so callers get a plain list.
    return array_values(array_diff($tokens, $stopwords));
}
}
三、将数据库中数据生成索引
// Ask TNTSearch for an indexer bound to the index named 'jblog.index'.
// NOTE(review): presumably the file is created under the configured 'storage' directory; confirm.
$indexer = $tnt->createIndex('jblog.index');
// NOTE(review): presumably makes the primary-key value itself part of the indexed text; confirm against the TNTSearch docs.
$indexer->includePrimaryKey();
// The 'id' column selected below is declared as the primary key.
$indexer->setPrimaryKey('id');
// Source rows: id, title and excerpt of every record in jblog_article.
$indexer->query('SELECT id, title, excerpt FROM jblog_article;');
//$indexer->setLanguage('no');
$indexer->run();
四、搜索
/**
 * Load a stop-word list from a text file.
 *
 * The file holds filler words (such as 的, 了, 是), one word per line.
 * Line endings ("\n" as well as "\r\n") are stripped and blank lines are
 * skipped, so the result never contains an empty string or a word with a
 * trailing carriage return.
 *
 * @param string $path File to read; defaults to the stop-word dictionary
 *                     shipped with jieba-php at its original hard-coded location.
 * @return array Stop words in file order; empty when the file cannot be opened.
 */
function getStopWords($path = 'E:/project/search/vendor/fukuball/jieba-php/src/dict/stop_words.txt') {
    $stopwords = [];

    $fp = fopen($path, 'rb');
    if ($fp === false) {
        return $stopwords;
    }

    // Loop on the fgets() result itself: testing feof() first lets a final
    // failed read through, which could append an empty entry to the list.
    while (($line = fgets($fp)) !== false) {
        $word = rtrim($line, "\r\n");
        if ($word !== '') {
            $stopwords[] = $word;
        }
    }
    fclose($fp);

    return $stopwords;
}
/**
 * Replace every occurrence of the given stop words inside a string.
 *
 * Empty-string entries in $stopwords are dropped before the replacement map
 * is built: an empty key makes strtr() return false on PHP versions before 8.0.
 *
 * @param string $doc       Text to clean.
 * @param array  $stopwords Words to replace.
 * @param string $replace   Replacement used for each stop word (a single space by default).
 * @return string $doc with each stop word replaced by $replace.
 */
function replaceWords($doc, $stopwords = [], $replace = ' ') {
    $pairs = [];
    foreach ($stopwords as $word) {
        $word = (string) $word;
        if ($word !== '') {
            $pairs[$word] = $replace;
        }
    }

    // With an empty map strtr() simply returns the input string.
    return strtr($doc, $pairs);
}
// Initialise the jieba-php classes before segmenting the query.
Jieba::init();
Finalseg::init();

// Point the search at the index by name.
$tnt->selectIndex('jblog.index');

$keyword = '分别查询出每个关键词对应的文档ID';

// Replace filler words (for example 的, 了, 是) before the phrase is segmented.
$keyword = replaceWords($keyword, getStopWords());
$seg_list = Jieba::cutForSearch($keyword);

// Join the segments with ' or ' and run the result as a boolean search.
$s = implode(' or ', $seg_list);
$res = $tnt->searchBoolean($s);

print_r($res);
fffffasdfasdfasdfasdfas;kdfjlaskl;dfjaskl;dfjakls;dfjkal;sfdjlaks;df als;dfj awkls;dfj asl;dfj aslk;dfjaskl;dfjas;ldf
访客评论