16深度探秘搜索技术_使用copy_to定制组合field解决cross-fields搜索弊端

Dongguo丶发布时间：2021-11-20 12:58:55 ，浏览量：5

用most_fields策略，去实现cross-fields搜索，有3大弊端，而且搜索结果也显示出了这3大弊端，如何解决这些弊端呢？

第一个办法：用copy_to，将多个field组合成一个field

问题其实就出在有多个field，有多个field以后，我们只要想办法将一个标识跨在多个field的情况，合并成一个field即可。比如说，一个人名，本来是first_name，last_name，现在合并成一个full_name。

新增field映射new_author_first_name、new_author_first_name、new_author_full_name

PUT /forum/_mapping/article
{
  "properties": {
      "new_author_first_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      },
      "new_author_last_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      },
      "new_author_full_name": {
          "type":     "text"
      }
  }
}

用了这个copy_to语法之后，就可以将多个字段的值拷贝到一个字段中，并建立倒排索引

POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"new_author_first_name" : "Peter", "new_author_last_name" : "Smith"} }		
{ "update": { "_id": "2"} }	
{ "doc" : {"new_author_first_name" : "Smith", "new_author_last_name" : "Williams"} }		
{ "update": { "_id": "3"} }
{ "doc" : {"new_author_first_name" : "Jack", "new_author_last_name" : "Ma"} }		
{ "update": { "_id": "4"} }
{ "doc" : {"new_author_first_name" : "Robbin", "new_author_last_name" : "Li"} }			
{ "update": { "_id": "5"} }
{ "doc" : {"new_author_first_name" : "Tonny", "new_author_last_name" : "Peter Smith"} }

响应结果

{
  "took": 20,
  "errors": false,
  "items": [
    {
      "update": {
        "_index": "forum",
        "_type": "article",
        "_id": "1",
        "_version": 9,
        "result": "noop",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "status": 200
      }
    },
    {
      "update": {
        "_index": "forum",
        "_type": "article",
        "_id": "2",
        "_version": 13,
        "result": "updated",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "status": 200
      }
    },
    {
      "update": {
        "_index": "forum",
        "_type": "article",
        "_id": "3",
        "_version": 9,
        "result": "updated",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "status": 200
      }
    },
    {
      "update": {
        "_index": "forum",
        "_type": "article",
        "_id": "4",
        "_version": 13,
        "result": "updated",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "status": 200
      }
    },
    {
      "update": {
        "_index": "forum",
        "_type": "article",
        "_id": "5",
        "_version": 8,
        "result": "updated",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "status": 200
      }
    }
  ]
}

我们把new_author_first_name和new_author_last_name合并成为一个项new_author_full_name，但是这个region并不存在于我们文档的source里。当我们这么定义我们的mapping的话，在文档被索引之后，有一个新的new_author_full_name项可以供我们进行搜索。

如果想要显示new_author_full_name 设置映射时"store": true

PUT /forum/_mapping/article
{
  "properties": {
      "new_author_first_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      },
      "new_author_last_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      },
      "new_author_full_name": {
          "type":     "text",
          "store": true
      }
  }
}

查询时

GET /forum/article/1?stored_fields=full_name

查询new_author_full_name包含Peter Smith的

GET /forum/article/_search
{
  "query": {
    "match": {
      "new_author_full_name":      {
        "query": "Peter Smith",
        "operator": "and"
      }
    }
  }
}

相当于

GET /forum/article/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "new_author_full_name": "Peter"
          }
        },
        {
          "match": {
            "new_author_full_name": "Smith"
          }
        }
      ]
    }
  }
}

响应结果

{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0.51623213,
    "hits": [
      {
        "_index": "forum",
        "_type": "article",
        "_id": "1",
        "_score": 0.51623213,
        "_source": {
          "articleID": "XHDK-A-1293-#fJ3",
          "userID": 1,
          "hidden": false,
          "postDate": "2017-01-01",
          "tag": [
            "java",
            "hadoop"
          ],
          "tag_cnt": 2,
          "view_cnt": 30,
          "title": "this is java and elasticsearch blog",
          "content": "i like to write best elasticsearch article",
          "sub_title": "learning more courses",
          "author_first_name": "Peter",
          "author_last_name": "Smith",
          "new_author_last_name": "Smith",
          "new_author_first_name": "Peter"
        }
      },
      {
        "_index": "forum",
        "_type": "article",
        "_id": "5",
        "_score": 0.5063205,
        "_source": {
          "articleID": "DHJK-B-1395-#Ky5",
          "userID": 3,
          "hidden": false,
          "postDate": "2021-11-11",
          "tag": [
            "elasticsearch"
          ],
          "tag_cnt": 1,
          "view_cnt": 10,
          "title": "this is spark blog",
          "content": "spark is best big data solution based on scala ,an programming language similar to java",
          "sub_title": "haha, hello world",
          "author_first_name": "Tonny",
          "author_last_name": "Peter Smith",
          "new_author_last_name": "Peter Smith",
          "new_author_first_name": "Tonny"
        }
      }
    ]
  }
}

问题1：只是找到尽可能多的field匹配的doc，而不是某个field完全匹配的doc --> 解决，最完全匹配的document被最先返回

问题2：most_fields，没办法用minimum_should_match去掉长尾数据，就是匹配的特别少的结果 --> 解决，可以使用minimum_should_match去掉长尾数据

GET /forum/article/_search
{
  "query": {
    "bool": {
      "should": [
        {"match": {
          "new_author_full_name": "Peter"
        }},{
          "match": {
            "new_author_full_name": "Smith"
          }
        }
      ],
      "minimum_should_match": 2    }
  }
}

响应结果

{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0.51623213,
    "hits": [
      {
        "_index": "forum",
        "_type": "article",
        "_id": "1",
        "_score": 0.51623213,
        "_source": {
          "articleID": "XHDK-A-1293-#fJ3",
          "userID": 1,
          "hidden": false,
          "postDate": "2017-01-01",
          "tag": [
            "java",
            "hadoop"
          ],
          "tag_cnt": 2,
          "view_cnt": 30,
          "title": "this is java and elasticsearch blog",
          "content": "i like to write best elasticsearch article",
          "sub_title": "learning more courses",
          "author_first_name": "Peter",
          "author_last_name": "Smith",
          "new_author_last_name": "Smith",
          "new_author_first_name": "Peter"
        }
      },
      {
        "_index": "forum",
        "_type": "article",
        "_id": "5",
        "_score": 0.5063205,
        "_source": {
          "articleID": "DHJK-B-1395-#Ky5",
          "userID": 3,
          "hidden": false,
          "postDate": "2021-11-11",
          "tag": [
            "elasticsearch"
          ],
          "tag_cnt": 1,
          "view_cnt": 10,
          "title": "this is spark blog",
          "content": "spark is best big data solution based on scala ,an programming language similar to java",
          "sub_title": "haha, hello world",
          "author_first_name": "Tonny",
          "author_last_name": "Peter Smith",
          "new_author_last_name": "Peter Smith",
          "new_author_first_name": "Tonny"
        }
      }
    ]
  }
}

问题3：TF/IDF算法，比如Peter Smith和Smith Williams，搜索Peter Smith的时候，由于first_name中很少有Smith的，所以query在所有document中的频率很低，得到的分数很高，可能Smith Williams反而会排在Peter Smith前面 --> 解决，Smith和Peter在一个field了，所以在所有document中出现的次数是均匀的，不会有极端的偏差

关注

打赏

1688896170

查看更多评论

16深度探秘搜索技术_使用copy_to定制组合field解决cross-fields搜索弊端

[ 申请 ]友情链接：