TNTSearch GIT库:https://github.com/vanry/laravel-scout-tntsearch
jieba-php GIT库:https://github.com/fukuball/jieba-php
本文结合结巴分词进行简单介绍
一、安装
composer require vanry/laravel-scout-tntsearch
composer require fukuball/jieba-php
二、范例
<?php
// Dependency: composer require teamtnt/tntsearch

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\JiebaAnalyse;
use TeamTNT\TNTSearch\TNTSearch;

require './vendor/autoload.php';

// Raise the memory limit for this script and declare the response as UTF-8 HTML.
ini_set('memory_limit', '1024M');
header('Content-type:text/html;charset=UTF-8');

$tnt = new TNTSearch();

$config = [
    // Index storage and search behaviour.
    'storage'       => 'E:/project/search/storage', // place where the index files will be stored
    'fuzziness'     => true,
    'searchBoolean' => true,
    'asYouType'     => true,
    'fuzzy'         => [
        'prefix_length'  => 2,
        'max_expansions' => 50,
        'distance'       => 2,
    ],

    // Tokenizer selection.
    'tokenizer' => [
        'driver' => 'default',
        'jieba'  => [
            'dict' => 'small',
        ],
    ],

    // Database connection the indexer reads from.
    'driver'    => 'mysql',
    'host'      => '127.0.0.1',
    'port'      => 3306,
    'database'  => 'jblog',
    'username'  => 'root',
    'password'  => 'root',
    // 'unix_socket' => env('DB_SOCKET', ''),
    'charset'   => 'utf8mb4',
    'collation' => 'utf8mb4_unicode_ci',
    // 'prefix' => 'lara_',
    'strict'    => true,
    'engine'    => null,

    // Words listed as stop words in the configuration.
    'stopwords' => [
        '的',
        '了',
        '而是',
    ],
];

// Hand the configuration to the TNTSearch instance.
$tnt->loadConfig($config);
为了让 TNTSearch 支持中文分词,我们还需要修改 TeamTNT\TNTSearch\Support\Tokenizer 这个类所在的文件,将其替换为如下内容:
<?php

namespace TeamTNT\TNTSearch\Support;

use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Jieba;

/**
 * Tokenizer that segments text with jieba-php instead of splitting on
 * non-letter/non-digit characters.
 *
 * Like the stock implementation it replaces, it lower-cases the text before
 * splitting and removes the caller-supplied stop words from the result.
 */
class Tokenizer implements TokenizerInterface
{
    /**
     * Initialise the jieba segmenter.
     *
     * @param array $options Options passed unchanged to Jieba::init() and Finalseg::init().
     */
    public function __construct(array $options = [])
    {
        Jieba::init($options);
        Finalseg::init($options);
    }

    /**
     * Split a text into search tokens.
     *
     * @param mixed $text      Text to tokenize; purely numeric input yields no tokens.
     * @param array $stopwords Tokens to exclude from the result.
     *
     * @return array List of tokens.
     */
    public function tokenize($text, $stopwords = [])
    {
        return is_numeric($text) ? [] : $this->getTokens($text, $stopwords);
    }

    /**
     * Segment the text with Jieba::cutForSearch() and drop stop words.
     *
     * @param mixed $text      Text to segment.
     * @param array $stopwords Tokens to exclude from the result.
     *
     * @return array List of tokens, re-indexed from 0.
     */
    public function getTokens($text, $stopwords = [])
    {
        // Lower-case first, as the stock tokenizer did, so Latin words are
        // tokenized identically regardless of the case they were written in.
        $tokens = Jieba::cutForSearch(mb_strtolower((string) $text));

        // Previously $stopwords was accepted but silently ignored.
        // array_diff() preserves keys, so re-index to return a plain list.
        return array_values(array_diff($tokens, (array) $stopwords));
    }
}
三、将数据库中数据生成索引
// Create the "jblog.index" index through the configured TNTSearch instance.
$indexer = $tnt->createIndex('jblog.index');

// Primary-key settings; 'id' is the first column selected by the query below.
$indexer->setPrimaryKey('id');
$indexer->includePrimaryKey();

// Rows returned by this query are what gets indexed.
$indexer->query('SELECT id, title, excerpt FROM jblog_article;');

// $indexer->setLanguage('no');

// Run the indexer.
$indexer->run();
四、搜索
/**
 * Load the stop-word list (meaningless words such as 的 / 了 / 是), one word per line.
 *
 * Blank lines are skipped and surrounding whitespace, including the "\r" of
 * Windows line endings, is stripped, so the list never contains empty or
 * unmatched entries.
 *
 * @param string $path Stop-word file; defaults to the list shipped with jieba-php.
 *
 * @return array List of stop words; empty when the file cannot be read.
 */
function getStopWords($path = 'E:/project/search/vendor/fukuball/jieba-php/src/dict/stop_words.txt')
{
    if (!is_readable($path)) {
        return [];
    }

    $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return [];
    }

    $stopwords = [];
    foreach ($lines as $line) {
        $word = trim($line);
        if ($word !== '') {
            $stopwords[] = $word;
        }
    }

    return $stopwords;
}

/**
 * Replace every occurrence of each stop word in a text.
 *
 * @param string $doc       Text to clean.
 * @param array  $stopwords Words to replace.
 * @param string $replace   Replacement inserted for each stop word.
 *
 * @return string The text with all stop words replaced.
 */
function replaceWords($doc, $stopwords = [], $replace = ' ')
{
    $pairs = [];
    foreach ($stopwords as $word) {
        $word = (string) $word;
        // An empty key makes strtr() return false before PHP 8.0, so skip it.
        if ($word !== '') {
            $pairs[$word] = $replace;
        }
    }

    if ($pairs === []) {
        return (string) $doc;
    }

    return strtr($doc, $pairs);
}

Jieba::init();
Finalseg::init();

$keyword = '分别查询出每个关键词对应的文档ID';

// Filter out meaningless words (的, 了, 是, ...) before segmenting.
$keyword = replaceWords($keyword, getStopWords());

// Segment the keyword and drop blank tokens so the boolean expression
// never contains an empty operand.
$seg_list = array_values(array_filter(
    array_map('trim', Jieba::cutForSearch($keyword)),
    function ($token) {
        return $token !== '';
    }
));

$tnt->selectIndex("jblog.index");

// Join the tokens with " or " and run a boolean search over them.
$s = implode(' or ', $seg_list);
$res = $tnt->searchBoolean($s);
print_r($res);
访客评论