1. 1. 说明
  2. 2. 创建mapping
  3. 3. 基本查询
    1. 3.1. 查询所有,指定返回数量,设置分页
    2. 3.2. match查询
    3. 3.3. 多字段查询
    4. 3.4. term查询
    5. 3.5. terms查询(多词条)
    6. 3.6. 词条(Term)查询-排序(Sorted)
    7. 3.7. range 范围查询
    8. 3.8. bool查询
    9. 3.9. highlighting高亮
    10. 3.10. 模糊(fuzzy)查询
    11. 3.11. 通配符(wildcard)查询
    12. 3.12. match_phrase查询(短语查询)
    13. 3.13. 短语前缀(Match Phrase Prefix)查询
    14. 3.14. aggregation聚合

说明

通过elasticsearch_dsl提供的API操作es进行基本查询操作

创建mapping

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from elasticsearch_dsl import DocType, Date, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections
# Open the Elasticsearch connection (local node) used by the snippets in this file.
connections.create_connection(hosts=["localhost"])
# connections.create_connection(hosts=["http://admin:admin@127.0.0.1:8080"]) # connect with authentication
class ZhihuQ(DocType):
    """Document mapping for Zhihu questions.

    Free-text fields (``title``, ``content``) use the ``ik_max_word``
    analyzer; ``topics`` and ``url`` are keyword fields, so they are only
    matched as exact values.
    """

    id = Integer()
    # Zhihu id.
    # NOTE(review): the original comment labels this field "long" while the
    # mapping uses Integer -- confirm the ids fit into an integer field.
    zhihu_id = Integer()
    topics = Keyword()  # topics
    url = Keyword()  # URL
    title = Text(analyzer="ik_max_word")  # title
    content = Text(analyzer="ik_max_word")  # content
    answer_num = Integer()  # number of answers
    comments_num = Integer()  # number of comments
    watch_user_num = Integer()  # number of followers
    click_num = Integer()  # number of views
    crawl_time = Date(format='date_optional_time||yyyy-MM-dd HH:mm:ss')  # creation time

    class Meta:
        index = "zhihuquestion"
        doc_type = "question"
if __name__ == "__main__":
    # Initialize the mapping when this file is run as a script.
    ZhihuQ.init()

基本查询

查询所有,指定返回数量,设置分页
1
2
3
4
5
6
7
8
9
10
11
12
13
GET zhihuquestion/_search
{
"query": {
"match_all": {}
},
"from": 0,
"size": 5
}
# python:
# Match every document and return the first page (from=0, size=5).
# Slicing a Search sets both "from" and "size", so no separate size setting
# is needed (a size passed through .extra() would be overridden by the slice).
search = ZhihuQ.search()
result = search.query("match_all")[0:5]
data = result.execute()
data = data.to_dict()['hits']['hits']
match查询

使用分词处理后查询

1
2
3
4
5
6
7
8
9
10
11
12
GET zhihuquestion/_search
{
"query": {
"match": {
"title": "python"
}
}
}
# python
# Full-text match on the title field; the query text is analyzed before matching.
response = search.query("match", title='Python').execute()
data = response.to_dict()['hits']['hits']

多字段查询
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
GET zhihuquestion/_search
{
"query": {
"multi_match": {
"query": "python",
"fields": [
"title",
"content"
]
}
}
}
# python
from elasticsearch_dsl.query import MultiMatch

# Search for the same text in both the title and the content fields.
multi_match = MultiMatch(query='python', fields=['title', 'content'])
# One .query() call is enough; calling .query() again without arguments
# adds nothing to the request.
result = search.query(multi_match)
data = result.execute()
data = data.to_dict()['hits']['hits']
term查询

查询值不经过分词(analyze)处理,直接按词条进行精确匹配

1
2
3
4
5
6
7
8
9
10
11
12
GET zhihuquestion/_search
{
"query": {
"term": {
"topics": "python"
}
}
}
# python
# Exact (non-analyzed) lookup on the keyword field `topics`, matching the
# request shown above.  A term query does not analyze its value, so it is
# aimed at the keyword field rather than the analyzed `title` text field.
result = search.query('term', topics='python')
data = result.execute()
data = data.to_dict()['hits']['hits']

terms查询(多词条)

任何一个满足都可以返回数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
GET zhihuquestion/_search
{
"query": {
"terms": {
"topics": [
"python",
"Python"
]
}
}
}
# python
# A document is returned when its topics field holds any one of these terms.
wanted_topics = ['Python', 'python']
response = search.query('terms', topics=wanted_topics).execute()
data = response.to_dict()['hits']['hits']

词条(Term)查询-排序(Sorted)

指定返回字段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
GET zhihuquestion/_search
{
"sort": [
{
"crawl_time": {
"order": "asc"
}
}
],
"query": {
"terms": {
"title": [
"Python",
"python"
]
}
},
"_source": [
"crawl_time"
]
}
# python
# Terms query on title, limited to the crawl_time source field and
# ordered by crawl_time ascending.
title_terms = search.query('terms', title=['Python', 'python'])
only_crawl_time = title_terms.source(['crawl_time'])
result = only_crawl_time.sort({'crawl_time': {"order": "asc"}})
data = result.execute().to_dict()['hits']['hits']

range 范围查询

用于日期、数字和字符串类型的字段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
GET zhihuquestion/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "Python"
}
},
{
"range": {
"crawl_time": {
"gte": "2017-12-21 19:19:44",
"lte": "now"
}
}
}
]
}
}
}
# python
# Title has to match 'Python' and crawl_time has to lie between the given
# timestamp and now; chaining .query() combines both clauses.
time_window = {'gte': '2017-12-21 19:19:44', 'lte': 'now'}
title_match = search.query("match", title='Python')
result = title_match.query("range", crawl_time=time_window)
data = result.execute().to_dict()['hits']['hits']

bool查询
  • must 等同于 AND
  • must_not 等同于 NOT
  • should 等同于 OR
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
GET zhihuquestion/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "python"
}
},
{
"match": {
"title": "c"
}
}
]
}
}
}
# python
from elasticsearch_dsl import Q

# Q objects are combined with &, | and ~ to build bool queries.  Each variant
# gets its own name so that none of them is silently overwritten.
q_and = Q("match", title='python') & Q("match", title='c')    # must: both clauses match
q_or = Q("match", title='python') | Q("match", title='c')     # should: either clause matches
q_not = ~Q("match", title='python') & ~Q("match", title='c')  # must_not: neither clause matches

# Run the AND form, which corresponds to the bool/must request shown above;
# swap in q_or or q_not to try the other combinations.
q = q_and
result = search.query(q)
data = result.execute()
data = data.to_dict()['hits']['hits']
highlighting高亮
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
GET zhihuquestion/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "python"
}
},
{
"match": {
"title": "c"
}
}
]
}
},
"highlight": {
"fields": {
"title": {
"fragment_size": 50
}
}
}
}
# python
from elasticsearch_dsl import Q

# Both match clauses must hold; matched parts of the title are returned as
# highlighted fragments with fragment_size=50.
q = Q("match", title='python') & Q("match", title='c')
s = search.query(q)
result = s.highlight('title', fragment_size=50)
data = result.execute()
data = data.to_dict()['hits']['hits']
模糊(fuzzy)查询

在 match 查询和 multi_match 查询中可以启用模糊匹配来容忍拼写错误;模糊度以相对于原始词的编辑距离来指定。当词条长度大于 5 个字符时,AUTO 等同于将模糊度指定为 2。不过,约 80% 的拼写错误编辑距离为 1,因此将模糊度设置为 1 可能会提升整体搜索性能。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
GET zhihuquestion/_search
{
"query": {
"multi_match": {
"fields": [
"title",
"content"
],
"fuzziness": "AUTO",
"query": "Python"
}
}
}
# python
from elasticsearch_dsl.query import MultiMatch

# multi_match with fuzziness enabled, so misspelled query terms can still match.
multi_match = MultiMatch(query='Python', fields=['title', 'content'], fuzziness='AUTO')
# A single .query() call builds the request; a second argument-less call is not needed.
result = search.query(multi_match)
data = result.execute()
data = data.to_dict()['hits']['hits']

通配符(wildcard)查询

通配符查询允许指定匹配的模式,而不是整个词组(term)检索

? 匹配任意单个字符,* 匹配零个或多个字符

1
2
3
4
5
6
7
8
9
10
11
12
GET zhihuquestion/_search
{
"query": {
"wildcard": {
"title": "pyth*"
}
}
}
# python
# Wildcard pattern on title; uses the same pattern ("pyth*") as the request
# shown above so both forms send the identical query.
result = search.query('wildcard', title='pyth*')
data = result.execute()
data = data.to_dict()['hits']['hits']

match_phrase查询(短语查询)

短语匹配查询要求查询字符串中的所有词都出现在文档中,并且按照查询字符串中的顺序彼此相邻;默认情况下这些词必须紧挨着,但可以指定偏离值(slop),表示在文档仍被视为匹配的前提下,词与词之间允许间隔的距离。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
GET zhihuquestion/_search
{
"query": {
"multi_match": {
"fields": [
"title"
],
"type": "phrase",
"slop": 6,
"query": "python下载"
}
}
}
# python
from elasticsearch_dsl.query import MultiMatch

# Phrase query on title.  slop matches the value (6) used in the request
# shown above, so both forms send the identical query.
multi_match = MultiMatch(
    query='python下载',
    fields=['title'],
    type='phrase',
    slop=6,
)
# A single .query() call builds the request; a second argument-less call is not needed.
result = search.query(multi_match)
data = result.execute()
data = data.to_dict()['hits']['hits']

短语前缀(Match Phrase Prefix)查询

短语前缀查询可以在查询时提供“即输即搜”(search-as-you-type)或简易版的自动补全功能,而无需以任何方式预先准备数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
GET zhihuquestion/_search
{
"query": {
"multi_match": {
"fields": [
"title"
],
"type": "phrase_prefix",
"slop": 2,
"query": "python精通"
}
}
}
# python
from elasticsearch_dsl.query import MultiMatch

# Phrase-prefix query on title, returning only the title source field.
multi_match = MultiMatch(
    query='python精通',
    fields=['title'],
    type='phrase_prefix',
    slop=2,
)
# A single .query() call builds the request; a second argument-less call is not needed.
result = search.query(multi_match).source(['title'])
data = result.execute()
data = data.to_dict()['hits']['hits']

aggregation聚合

聚合类型: Bucketing, Metric, Matrix, Pipeline

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# metric 计算相关, max, min, avg等
GET zhihuquestion/_search
{
"query": {
"match_all": {}
},
"aggs": {
"max_click_num": {
"max": {
"field": "click_num"
}
}
}
}
# python
from elasticsearch_dsl import A

# Metric aggregation: maximum of click_num.
_agg = A('max', field='click_num')
# The aggregation is registered on `search` in place, so execute `search`
# itself rather than depending on what .metric() happens to return.
search.aggs.metric('max_click_num', _agg)
data = search.execute()
print(data.aggregations.max_click_num)  # the largest click count
import json

from elasticsearch_dsl import A

# Bucket aggregation: group documents by their comment count.
# NOTE(review): the mapping in this file defines `comments_num`, not
# `comment`, so the aggregation targets `comments_num` -- confirm this is
# the intended field.
_agg = A('terms', field='comments_num')
f = search.aggs.bucket('bucket_comment', _agg)
query_word = json.dumps(f.to_dict())
print(query_word)
# The bucket was added to `search` in place, so executing `search` runs it.
response = search.execute()
print(response.aggregations.bucket_comment.buckets)
print(response.to_dict())