之前大家会发现,字段全部是用英文
其实我们用来进行搜索的,绝大多数都是中文应用,很少做英文的。
默认分词器standard没有办法对中文进行合理分词,只是将每个中文字符一个一个地切割开来,比如说:中国人 --> 中 国 人
在搜索引擎领域,比较成熟和流行的中文分词器,就是ik分词器
中国人很喜欢吃油条
standard:中 国 人 很 喜 欢 吃 油 条
ik:中国人 很 喜欢 吃 油条
1、在elasticsearch中安装ik中文分词器
因为使用es的版本是5.6.0,所以ik分词器也要使用与之对应的5.6.0版本
(1)git clone https://github.com/medcl/elasticsearch-analysis-ik
(2)git checkout tags/v5.6.0(需要先进入克隆下来的elasticsearch-analysis-ik目录,再执行本步骤及下一步)
(3)mvn package
或者直接访问 https://github.com/medcl/elasticsearch-analysis-ik/releases/tag/v5.6.0 去下载
(4)将target/releases/elasticsearch-analysis-ik-5.6.0.zip拷贝到es/plugins/ik目录下
(5)在es/plugins/ik下对elasticsearch-analysis-ik-5.6.0.zip进行解压缩
(6)删掉压缩包,重启es
2、ik分词器基础知识
ik分词器有两种analyzer:ik_max_word、ik_smart,可以根据自己的需要来选,但是一般选用ik_max_word
ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合;
ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
如果使用ik_smart,“中华人民共和国国歌”只会被拆分为“中华人民共和国”和“国歌”两个词,此时搜索“共和国”就搜不到了
3、ik分词器的使用
删除索引my_index
DELETE /my_index
创建索引,设置text使用ik_max_word
PUT /my_index
{
"mappings": {
"my_type": {
"properties": {
"text": {
"type": "text",
"analyzer": "ik_max_word"
}
}
}
}
}
增加数据
POST /my_index/my_type/_bulk
{ "index": { "_id": "1"} }
{ "text": "男子偷上万元发红包求交女友 被抓获时仍然单身" }
{ "index": { "_id": "2"} }
{ "text": "16岁少女为结婚“变”22岁 7年后想离婚被法院拒绝" }
{ "index": { "_id": "3"} }
{ "text": "深圳女孩骑车逆行撞奔驰 遭索赔被吓哭(图)" }
{ "index": { "_id": "4"} }
{ "text": "女人对护肤品比对男票好?网友神怼" }
{ "index": { "_id": "5"} }
{ "text": "为什么国内的街道招牌用的都是红黄配?" }
响应结果
{
"took": 61,
"errors": false,
"items": [
{
"index": {
"_index": "my_index",
"_type": "my_type",
"_id": "1",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"created": true,
"status": 201
}
},
{
"index": {
"_index": "my_index",
"_type": "my_type",
"_id": "2",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"created": true,
"status": 201
}
},
{
"index": {
"_index": "my_index",
"_type": "my_type",
"_id": "3",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"created": true,
"status": 201
}
},
{
"index": {
"_index": "my_index",
"_type": "my_type",
"_id": "4",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"created": true,
"status": 201
}
},
{
"index": {
"_index": "my_index",
"_type": "my_type",
"_id": "5",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"created": true,
"status": 201
}
}
]
}
测试分词器
GET /my_index/_analyze
{
"text": "男子偷上万元发红包求交女友 被抓获时仍然单身",
"analyzer": "ik_max_word"
}
响应结果
{
"tokens": [
{
"token": "男子",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "偷上",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 1
},
{
"token": "上万",
"start_offset": 3,
"end_offset": 5,
"type": "CN_WORD",
"position": 2
},
{
"token": "万元",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 3
},
{
"token": "万",
"start_offset": 4,
"end_offset": 5,
"type": "TYPE_CNUM",
"position": 4
},
{
"token": "元",
"start_offset": 5,
"end_offset": 6,
"type": "COUNT",
"position": 5
},
{
"token": "发红包",
"start_offset": 6,
"end_offset": 9,
"type": "CN_WORD",
"position": 6
},
{
"token": "发红",
"start_offset": 6,
"end_offset": 8,
"type": "CN_WORD",
"position": 7
},
{
"token": "红包",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "求",
"start_offset": 9,
"end_offset": 10,
"type": "CN_CHAR",
"position": 9
},
{
"token": "交",
"start_offset": 10,
"end_offset": 11,
"type": "CN_CHAR",
"position": 10
},
{
"token": "女友",
"start_offset": 11,
"end_offset": 13,
"type": "CN_WORD",
"position": 11
},
{
"token": "被",
"start_offset": 14,
"end_offset": 15,
"type": "CN_CHAR",
"position": 12
},
{
"token": "抓获",
"start_offset": 15,
"end_offset": 17,
"type": "CN_WORD",
"position": 13
},
{
"token": "时",
"start_offset": 17,
"end_offset": 18,
"type": "CN_CHAR",
"position": 14
},
{
"token": "仍然",
"start_offset": 18,
"end_offset": 20,
"type": "CN_WORD",
"position": 15
},
{
"token": "单身",
"start_offset": 20,
"end_offset": 22,
"type": "CN_WORD",
"position": 16
}
]
}
查询
GET /my_index/my_type/_search
{
"query": {
"match": {
"text": "16岁少女结婚好还是单身好?"
}
}
}
响应结果
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 2.6093416,
"hits": [
{
"_index": "my_index",
"_type": "my_type",
"_id": "2",
"_score": 2.6093416,
"_source": {
"text": "16岁少女为结婚“变”22岁 7年后想离婚被法院拒绝"
}
},
{
"_index": "my_index",
"_type": "my_type",
"_id": "4",
"_score": 1.3300087,
"_source": {
"text": "女人对护肤品比对男票好?网友神怼"
}
},
{
"_index": "my_index",
"_type": "my_type",
"_id": "1",
"_score": 0.26301134,
"_source": {
"text": "男子偷上万元发红包求交女友 被抓获时仍然单身"
}
}
]
}
}