Re-Index
많은 이모티콘 검색을 해보니, 각 원문에 이모티콘이 얼마나 포함되어 있는지, 어떤 이모티콘이 가장 많이 있는지 검색해보자.
๊ทธ์ ์ ์ฌ์ ์ค๋น ์์ ์ผ๋ก text field๋ก ๋ค์ด๊ฐ ๋ฐ์ดํฐ์์ ํค์๋๋ฅผ ์ถ์ถ(Es ๋ด๋ถ์์๋ Term)ํ ์ ์๋๋ก ์ธ๋ฑ์ค๋ฅผ ๊ตฌ์ฑํ๊ณ ์ ์ฒด ๋ฌธ์์์ ํค์๋ ๋น๋์๋ฅผ ๊ธฐ์ค์ผ๋ก ๋ฐ์ดํฐ๋ฅผ ์ถ์ถํ๋ ๋ฐฉ๋ฒ์ ์ฐพ์๋ด ๋๋ค.
인덱스 구성
PUT /mbti_term
{
  // Index used for term-frequency analysis of the `contents` text.
  "settings": {
    "analysis": {
      "tokenizer": {
        // Nori tokenizer shared by both custom analyzers below.
        "nori_t_mixed": {
          "type": "nori_tokenizer",
          "decompound_mode": "mixed"
        }
      },
      "filter": {
        // Drops every token whose Nori part-of-speech tag is listed in
        // `stoptags`; presumably so that mostly nouns remain, as the
        // analyzer name `nori_pos_noun` suggests.
        "pos_filter": {
          "type": "nori_part_of_speech",
          "stoptags": [
            "VV", "VA", "VX", "VCP", "VCN", "MM", "MAG", "MAJ",
            "IC", "J", "E",
            "XPN", "XSA", "XSN", "XSV",
            "SP", "SSC", "SSO", "SC", "SE",
            "UNA"
          ]
        }
      },
      "analyzer": {
        // Both analyzers now declare "type": "custom" explicitly and pass
        // their token filters as a list, so the two definitions read alike.
        "nori_mixed": {
          "type": "custom",
          "tokenizer": "nori_t_mixed",
          "filter": ["shingle"]
        },
        "nori_pos_noun": {
          "type": "custom",
          "tokenizer": "nori_t_mixed",
          "filter": ["pos_filter"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text"
      },
      "contents": {
        // `fielddata` makes the analyzed text usable in a terms aggregation;
        // `term_vector` stores per-document term vectors.
        "type": "text",
        "fielddata": true,
        "term_vector": "yes",
        "fields": {
          // Un-analyzed copy of the whole text.
          "full": {
            "type": "keyword"
          },
          // NOTE(review): both Nori sub-fields analyze query text with
          // `standard` rather than Nori - confirm this asymmetry is intended.
          "nori_mixed": {
            "type": "text",
            "analyzer": "nori_mixed",
            "search_analyzer": "standard",
            "fielddata": true,
            "term_vector": "yes"
          },
          "nori_noun": {
            "type": "text",
            "analyzer": "nori_pos_noun",
            "search_analyzer": "standard",
            "fielddata": true,
            "term_vector": "yes"
          }
        }
      },
      "keyword": {
        "type": "keyword"
      },
      "platform": {
        "type": "keyword"
      },
      "published_at": {
        "type": "date"
      },
      "doc_url": {
        "type": "text"
      },
      "comment_cnt": {
        "type": "integer"
      },
      "like_cnt": {
        "type": "integer"
      }
    }
  }
}
์ฌ๊ธฐ์ ํต์ฌ์ ๋์ ํ๋์์ term_vector,fielddata ์ด๋ค.
"contents": {
  // Base field: text with fielddata and term vectors enabled.
  "type": "text",
  "fielddata": true,
  "term_vector": "yes",
  "fields": {
    // Un-analyzed keyword copy of the text.
    "full": {
      "type": "keyword"
    },
    // Indexed with the `nori_mixed` analyzer.
    "nori_mixed": {
      "type": "text",
      "fielddata": true,
      "term_vector": "yes",
      "analyzer": "nori_mixed",
      "search_analyzer": "standard"
    },
    // Indexed with the `nori_pos_noun` analyzer.
    "nori_noun": {
      "type": "text",
      "fielddata": true,
      "term_vector": "yes",
      "analyzer": "nori_pos_noun",
      "search_analyzer": "standard"
    }
  }
}
이를 바탕으로 문서에 있는 단어(term)의 수를 계산하는 쿼리를 보면 아래와 같다.
GET mbti_term/_search
{
  "aggs": {
    // Buckets the indexed terms of the chosen field, up to 1000 buckets.
    "term_cnt": {
      "terms": {
        // contents.nori_mixed, contents.nori_noun, etc. can be used here as well
        "field": "contents",
        "size": 1000
      }
    }
  },
  // Return no search hits; only the aggregation result is of interest.
  "size": 0
}
여기서 단어의 길이로 정렬하여 목록을 출력하는 방법도 있다. - 링크 (하지만 502 Bad Gateway 에러가 발생한다..)
위의 쿼리에 대한 결과는 아래와 같다.
{
"took" : 555,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"term_cnt" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 11058130,
"buckets" : [
{
"key" : "์ด",
"doc_count" : 15136
},
{
"key" : "ํ",
"doc_count" : 15088
},
{
"key" : "๋",
"doc_count" : 14448
},
{
"key" : "แซ",
"doc_count" : 13909
},
{
"key" : "๊ณ ",
"doc_count" : 13348
},
{
"key" : "์",
"doc_count" : 13032
},
{
"key" : "์",
"doc_count" : 13015
},
{
"key" : "๊ฐ",
"doc_count" : 12940
},
이모티콘 필드 구성
본문에서 어떻게 이모티콘만 추출할 수 있을까?
아래 쿼리를 사용하면 이모티콘이 포함된 본문만 추출할 수 있다.
GET mbti_term/_search
{
  "query": {
    "regexp": {
      // Matches indexed terms of `contents` that consist only of code points
      // from these blocks:
      //   U+1F600-U+1F64F, U+1F300-U+1F5FF, U+1F680-U+1F6FF, U+1F1E0-U+1F1FF
      // The surrogate pairs are single-backslash JSON escapes, so the JSON
      // parser hands the real code points to Lucene; Lucene's regexp syntax
      // has no \uXXXX escape of its own and would read a doubled backslash
      // as plain ASCII characters. No commas or spaces between the ranges:
      // inside [...] they would be literal class members.
      // A regexp must match the whole term, so `+` also accepts terms made
      // of several such code points (e.g. a two-symbol flag).
      "contents": "[\uD83D\uDE00-\uD83D\uDE4F\uD83C\uDF00-\uD83D\uDDFF\uD83D\uDE80-\uD83D\uDEFF\uD83C\uDDE0-\uD83C\uDDFF]+"
    }
  }
}
콘텐츠에서 이모티콘이 포함된 콘텐츠만 추출하는 것을 완료했습니다. 하지만 이모티콘만 별도의 필드로 저장해서 분석할 수 있도록 분류하는 게 필요할 것 같습니다.
다음에는 실제 인덱스 필드에 이모티콘 값만 저장될 수 있도록 하는 인덱스 구성을 진행해보겠습니다.