elasticsearch查询之keyword字段的查询相关度评分控制

一、数据情况

purchase记录每个用户的购买信息;

PUT purchase {     "mappings":{         "properties":{             "id":{                 "type":"keyword"             },             "name":{                 "type":"text"             },             "goods":{                 "properties":{                     "id":{                         "type":"keyword"                     },                     "name":{                         "type":"text"                     }                 }             }         }     } } 

向索引中写入(index)三个文档(document)

PUT purchase/_doc/1 {   "id":1,   "name":"sam",   "goods":[     {"id":"g1","name":"ipad"},     {"id":"g2","name":"iphone"}   ] }   PUT purchase/_doc/2 {   "id":2,   "name":"coco",   "goods":[     {"id":"g1","name":"ipad"},     {"id":"g2","name":"iphone"},     {"id":"g3","name":"ipod"}   ] }   PUT purchase/_doc/3 {   "id":3,   "name":"jim",   "goods":[     {"id":"g1","name":"ipad"},     {"id":"g2","name":"iphone"},     {"id":"g3","name":"ipod"},     {"id":"g4","name":"TV"}   ] } 

查看索引数据情况

POST purchase/_search {   "query": {     "match_all": {}   } }  
{     "took":331,     "timed_out":false,     "_shards":{         "total":1,         "successful":1,         "skipped":0,         "failed":0     },     "hits":{         "total":{             "value":3,             "relation":"eq"         },         "max_score":1,         "hits":[             {                 "_index":"purchase",                 "_id":"1",                 "_score":1,                 "_source":{                     "id":1,                     "name":"sam",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                         }                     ]                 }             },             {                 "_index":"purchase",                 "_id":"2",                 "_score":1,                 "_source":{                     "id":2,                     "name":"coco",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                         },                         {                             "id":"g3",                             "name":"ipod"                         }                     ]                 }             },             {                 "_index":"purchase",                 "_id":"3",                 "_score":1,                 "_source":{                     "id":3,                     "name":"jim",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                 
        },                         {                             "id":"g3",                             "name":"ipod"                         },                         {                             "id":"g4",                             "name":"TV"                         }                     ]                 }             }         ]     } } 

二、查询需求

我们需要查询购买过某种商品的顾客,一般我们可以通过ui的业务逻辑得到需要筛选的一些商品的id,由于id字段是一个不需要分词的keyword字段,所以我们会直接使用term级别的查询;

 POST purchase/_search {   "query": {     "terms": {       "goods.id": [         "g2",         "g3",         "g4"       ]     }   } } 

我们可以看到查询结果中的三条记录的权重打分都是1;但正常情况下,命中的目标商品越多的客户,相对来说价值越大,其权重得分也应该越大;

{     "took":0,     "timed_out":false,     "_shards":{         "total":1,         "successful":1,         "skipped":0,         "failed":0     },     "hits":{         "total":{             "value":3,             "relation":"eq"         },         "max_score":1,         "hits":[             {                 "_index":"purchase",                 "_id":"1",                 "_score":1,                 "_source":{                     "id":1,                     "name":"sam",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                         }                     ]                 }             },             {                 "_index":"purchase",                 "_id":"2",                 "_score":1,                 "_source":{                     "id":2,                     "name":"coco",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                         },                         {                             "id":"g3",                             "name":"ipod"                         }                     ]                 }             },             {                 "_index":"purchase",                 "_id":"3",                 "_score":1,                 "_source":{                     "id":3,                     "name":"jim",                     "goods":[                         {                             "id":"g1",                             "name":"ipad"                         },                         {                             "id":"g2",                             "name":"iphone"                   
      },                         {                             "id":"g3",                             "name":"ipod"                         },                         {                             "id":"g4",                             "name":"TV"                         }                     ]                 }             }         ]     } } 

三、terms查询分析

我们使用_explain分析一下terms查询怎么打分的;

POST purchase/_explain/3 {   "query": {     "terms": {       "goods.id": [         "g2",         "g3",         "g4"       ]     }   } } 

我们可以看到elasticsearch最终将terms查询重写为ConstantScore查询,此查询默认权重打分为1;

{   "_index" : "purchase",   "_id" : "3",   "matched" : true,   "explanation" : {     "value" : 1.0,     "description" : "ConstantScore(goods.id:g2 goods.id:g3 goods.id:g4)",     "details" : [ ]   } } 

terms提供的查询参数十分有限,其中涉及权重的只有boost,但是这只是针对整个terms查询,而不是内部的子查询;

POST purchase/_explain/3 {   "query": {     "terms": {       "goods.id": [         "g2",         "g3",         "g4"       ],       "boost":2     }   } }  {   "_index" : "purchase",   "_id" : "3",   "matched" : true,   "explanation" : {     "value" : 2.0,     "description" : "ConstantScore(goods.id:g2 goods.id:g3 goods.id:g4)^2.0",     "details" : [ ]   } }  

四、构建子查询打分

match是elasticsearch提供的一个跟terms类似的查询;由于goods.id的type是keyword,默认情况下整个输入字符串不会被分词,而是作为一个完整的term去匹配,所以需要给match指定一个查询时的analyzer,才能保证输入的几个id被拆分开,分别作为不同的子查询;

POST purchase/_search {   "query": {     "match": {       "goods.id": {         "query": "g2 g3 g4",         "analyzer":"standard"       }     }   } }   {   "took" : 1,   "timed_out" : false,   "_shards" : {     "total" : 1,     "successful" : 1,     "skipped" : 0,     "failed" : 0   },   "hits" : {     "total" : {       "value" : 3,       "relation" : "eq"     },     "max_score" : 2.178501,     "hits" : [       {         "_index" : "purchase",         "_id" : "3",         "_score" : 2.178501,         "_source" : {           "id" : 3,           "name" : "jim",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             },             {               "id" : "g4",               "name" : "TV"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "2",         "_score" : 0.8298607,         "_source" : {           "id" : 2,           "name" : "coco",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "1",         "_score" : 0.18360566,         "_source" : {           "id" : 1,           "name" : "sam",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             }           ]         }       }     ]   } }  

通过查看文档3的打分情况,我们可以看到elasticsearch先针对每个关键字计算打分,然后将三项打分的和作为最终的打分;在这里我们也可以看到elasticsearch内部会自动将match查询rewrite为三个子查询;

POST purchase/_explain/3 {   "query": {     "match": {       "goods.id": {         "query": "g2 g3 g4",         "analyzer":"standard"       }     }   } }  {   "_index" : "purchase",   "_id" : "3",   "matched" : true,   "explanation" : {     "value" : 2.178501,     "description" : "sum of:",     "details" : [       {         "value" : 0.18360566,         "description" : "weight(goods.id:g2 in 2) [PerFieldSimilarity], result of:",         "details" : []       },       {         "value" : 0.646255,         "description" : "weight(goods.id:g3 in 2) [PerFieldSimilarity], result of:",         "details" : []       },       {         "value" : 1.3486402,         "description" : "weight(goods.id:g4 in 2) [PerFieldSimilarity], result of:",         "details" : []       }     ]   } } 

我们也可以通过bool查询,使用它的should在查询之前手动组建多个子查询;

POST purchase/_search {   "query": {     "bool": {       "should": [         {"term": {"goods.id": "g2"}},         {"term": {"goods.id": "g3"}},         {"term": {"goods.id": "g4"}}       ],       "minimum_should_match": 1     }   } }  {   "took" : 1,   "timed_out" : false,   "_shards" : {     "total" : 1,     "successful" : 1,     "skipped" : 0,     "failed" : 0   },   "hits" : {     "total" : {       "value" : 3,       "relation" : "eq"     },     "max_score" : 2.178501,     "hits" : [       {         "_index" : "purchase",         "_id" : "3",         "_score" : 2.178501,         "_source" : {           "id" : 3,           "name" : "jim",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             },             {               "id" : "g4",               "name" : "TV"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "2",         "_score" : 0.8298607,         "_source" : {           "id" : 2,           "name" : "coco",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "1",         "_score" : 0.18360566,         "_source" : {           "id" : 1,           "name" : "sam",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             }           ]         }       }     ]   } } 

在bool查询中,通过查看文档3的打分情况,我们可以看到elasticsearch也是先针对每个关键字计算打分,然后将三项打分的和作为最终的打分;

POST purchase/_explain/3 {   "query": {     "bool": {       "should": [         {"term": {"goods.id": "g2"}},         {"term": {"goods.id": "g3"}},         {"term": {"goods.id": "g4"}}       ],       "minimum_should_match": 1     }   } }  {   "_index" : "purchase",   "_id" : "3",   "matched" : true,   "explanation" : {     "value" : 2.178501,     "description" : "sum of:",     "details" : [       {         "value" : 0.18360566,         "description" : "weight(goods.id:g2 in 2) [PerFieldSimilarity], result of:",         "details" : []       },       {         "value" : 0.646255,         "description" : "weight(goods.id:g3 in 2) [PerFieldSimilarity], result of:",         "details" : []       },       {         "value" : 1.3486402,         "description" : "weight(goods.id:g4 in 2) [PerFieldSimilarity], result of:",         "details" : []       }     ]   } } 

五、控制子查询的打分

不管是elasticsearch自动组建子查询,还是我们自己手动构建子查询,elasticsearch都会针对每个查询做相关性的打分计算,这对于一般的语义化关键字搜索是没有问题的;

我们这里的搜索条件goods.id一般是没有任何语义的,不同的值命中后的打分应该是一样的;因此我们需要使用bool+constant_score+term来手动构建查询语句;

POST purchase/_search {   "query": {     "bool": {       "should": [         {"constant_score": {"filter": {"term": {"goods.id": "g2"}}}},         {"constant_score": {"filter": {"term": {"goods.id": "g3"}}}},         {"constant_score": {"filter": {"term": {"goods.id": "g4"}}}}       ],       "minimum_should_match": 1     }   } }   {   "took" : 0,   "timed_out" : false,   "_shards" : {     "total" : 1,     "successful" : 1,     "skipped" : 0,     "failed" : 0   },   "hits" : {     "total" : {       "value" : 3,       "relation" : "eq"     },     "max_score" : 3.0,     "hits" : [       {         "_index" : "purchase",         "_id" : "3",         "_score" : 3.0,         "_source" : {           "id" : 3,           "name" : "jim",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             },             {               "id" : "g4",               "name" : "TV"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "2",         "_score" : 2.0,         "_source" : {           "id" : 2,           "name" : "coco",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             },             {               "id" : "g3",               "name" : "ipod"             }           ]         }       },       {         "_index" : "purchase",         "_id" : "1",         "_score" : 1.0,         "_source" : {           "id" : 1,           "name" : "sam",           "goods" : [             {               "id" : "g1",               "name" : "ipad"             },             {               "id" : "g2",               "name" : "iphone"             }           ]         }       }     ]   } }  

我们看下文档3的打分情况,每一个命中项的打分都是固定的1,最终的打分是各命中项打分的和;

POST purchase/_explain/3 {   "query": {     "bool": {       "should": [         {"constant_score": {"filter": {"term": {"goods.id": "g2"}}}},         {"constant_score": {"filter": {"term": {"goods.id": "g3"}}}},         {"constant_score": {"filter": {"term": {"goods.id": "g4"}}}}       ],       "minimum_should_match": 1     }   } }  {   "_index" : "purchase",   "_id" : "3",   "matched" : true,   "explanation" : {     "value" : 3.0,     "description" : "sum of:",     "details" : [       {         "value" : 1.0,         "description" : "ConstantScore(goods.id:g2)",         "details" : [ ]       },       {         "value" : 1.0,         "description" : "ConstantScore(goods.id:g3)",         "details" : [ ]       },       {         "value" : 1.0,         "description" : "ConstantScore(goods.id:g4)",         "details" : [ ]       }     ]   } }  

发表评论

评论已关闭。

相关文章