Elastic Search

[Elastic Search] MBTI 검색 프로젝트 - 1. 검색 Score 튜닝

Tempo 2022. 4. 12. 22:14

ํ˜„์žฌ ์—˜๋ผ์Šคํ‹ฑ์„œ์น˜๋ฅผ ์ด์šฉํ•ด ์ˆ˜์ง‘ํ•œ ๋ฐ์ดํ„ฐ(MBTI ํƒ€์ž…๋ณ„ ํ…์ŠคํŠธ ๋ฐ์ดํ„ฐ)๋ฅผ ์กฐํšŒํ•˜๋Š” ํ”„๋กœ์ ํŠธ๋ฅผ ์ง„ํ–‰ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ์—ฌ๊ธฐ์„œ MBTI ํƒ€์ž…๊ณผ ์Šค๋งˆํŠธํฐ(์•„์ดํฐ ๋˜๋Š” ๊ฐค๋Ÿญ์‹œ)์˜ ์ƒ๊ด€์„ฑ์„ ๋ถ„์„ํ•˜๊ธฐ ์œ„ํ•ด ES์˜ ์ฟผ๋ฆฌ๋ฅผ ํŠœ๋‹ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.

 

์ธ๋ฑ์Šค ๊ตฌ์„ฑ

์ฝ˜ํ…์ธ  ๋‚ด๋ถ€์—์„œ ๋ช…์‚ฌ๋งŒ ์ถ”์ถœํ•˜์—ฌ ๋ถ„์„ํ•˜๊ธฐ ์œ„ํ•ด nori_noun์ด๋ผ๋Š” ๋ถ„์„๊ธฐ๋ฅผ ๋ณ„๋„๋กœ ์ƒ์„ฑํ•˜์—ฌ ํ•„๋“œ๋กœ ์„ค์ •ํ–ˆ์Šต๋‹ˆ๋‹ค.

{
  "mbti" : {
    "aliases" : { },
    "mappings" : {
      "properties" : {
        "comment_cnt" : {
          "type" : "integer"
        },
        "contents" : {
          "type" : "text",
          "fields" : {
            "full" : {
              "type" : "keyword"
            },
            "nori_mixed" : {
              "type" : "text",
              "analyzer" : "nori_mixed",
              "search_analyzer" : "standard"
            },
            "nori_noun" : {
              "type" : "text",
              "analyzer" : "nori_pos_noun",
              "search_analyzer" : "standard"
            }
          }
        },
        "doc_url" : {
          "type" : "text"
        },
        "keyword" : {
          "type" : "keyword"
        },
        "like_cnt" : {
          "type" : "integer"
        },
        "platform" : {
          "type" : "keyword"
        },
        "published_at" : {
          "type" : "date"
        },
        "title" : {
          "type" : "text"
        },
        "writer" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    },
    "settings" : {
      "index" : {
        "routing" : {
          "allocation" : {
            "include" : {
              "_tier_preference" : "data_content"
            }
          }
        },
        "number_of_shards" : "1",
        "provided_name" : "mbti",
        "creation_date" : "1649761141991",
        "analysis" : {
          "filter" : {
            "pos_filter" : {
              "type" : "nori_part_of_speech",
              "stoptags" : [
                "VV",
                "VA",
                "VX",
                "VCP",
                "VCN",
                "MM",
                "MAG",
                "MAJ",
                "IC",
                "J",
                "E",
                "XPN",
                "XSA",
                "XSN",
                "XSV",
                "SP",
                "SSC",
                "SSO",
                "SC",
                "SE",
                "UNA"
              ]
            }
          },
          "analyzer" : {
            "nori_mixed" : {
              "filter" : "shingle",
              "tokenizer" : "nori_t_mixed"
            },
            "nori_pos_noun" : {
              "filter" : "pos_filter",
              "type" : "custom",
              "tokenizer" : "nori_t_mixed"
            }
          },
          "tokenizer" : {
            "nori_t_mixed" : {
              "type" : "nori_tokenizer",
              "decompound_mode" : "mixed"
            }
          }
        },
        "number_of_replicas" : "1",
        "uuid" : "e8w9oHlLSyqF5oDzviz0KA",
        "version" : {
          "created" : "7170299"
        }
      }
    }
  }
}

๊ฐ MBTI ํƒ€์ž…๋ณ„ ์Šค๋งˆํŠธํฐ ์„ ํ˜ธ๋„

GET mbti/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "contents.nori_noun": {
              "query": "ENFP"
            }
          }
        },
        {
          "match_phrase": {
            "contents": {
              "query": "์•„์ดํฐ",
              "boost": 2
            }
          }
        }
        ]
    }
  }
}

Match ์ฟผ๋ฆฌ์— ์ฝ˜ํ…์ธ  ๋‚ด๋ถ€์— MBTI ํƒ€์ž…์ด ์žˆ๋Š”์ง€์— ๋Œ€ํ•ด ์šฐ์„  ์กฐํšŒํ•œ๋‹ค. ๊ทธ๋ฆฌ๊ณ  match_phrase๋กœ ์•„์ดํฐ์— boost๋ฅผ ํ•˜์—ฌ ์•„์ดํฐ์ด ํฌํ•จ๋œ ์›๋ฌธ์ด ์ƒ์œ„ ๊ฒ€์ƒ‰๋˜๋„๋ก ์ฟผ๋ฆฌ๋ฅผ ๋งŒ๋“ค์—ˆ์Šต๋‹ˆ๋‹ค.

์‚ฌ์‹ค match_phrase ๋Š” ๊ตฌ๋ฌธ, ์ฆ‰ โ€œ์•„์ดํฐ ๊ฟ€ํŒโ€์ฒ˜๋Ÿผ ๊ตฌ๋ฌธ์˜ ๋งค์นญ ์ •๋„๋ฅผ boost ํ•ด์ฃผ๊ธฐ ๋•Œ๋ฌธ์— ์œ„ ์‚ฌ๋ก€์—์„œ๋Š” ์ ์ ˆํ•œ ์‚ฌ์šฉ ์‚ฌ๋ก€๋Š” ์•„๋‹™๋‹ˆ๋‹ค. ์›๋ž˜๋Š” โ€œENFP ์•„์ดํฐโ€์œผ๋กœ ๊ฒ€์ƒ‰ํ•˜์—ฌ ๊ฐ ๋‹จ์–ด ๊ฐ„์˜ Distance๋ฅผ ๊ฒ€์ƒ‰ Score ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ ์‚ฌ์šฉํ•˜๋ ค๊ณ  ํ–ˆ์ง€๋งŒ ์‹คํŒจํ•˜์—ฌ ์šฐ์„  match_phrase๋ฅผ ์‚ฌ์šฉํ•˜์˜€์Šต๋‹ˆ๋‹ค.

 

해결방법은? (아래부터는 삽질의 역사입니다...)

#1 Should 를 사용한 조회

Should์˜ ๊ฒฝ์šฐ ํ”ํžˆ ๋งํ•˜๋Š” or ์กฐ๊ฑด์œผ๋กœ ์กฐ๊ฑด์— ํฌํ•จ๋œ ๋‹จ์–ด๊ฐ€ ์ƒ์œ„๋กœ ์˜ฌ๋ผ์˜ค๋„๋ก(์ฆ‰ score ์ ์ˆ˜๊ฐ€ ๋” ๋†’์•„์งˆ ์ˆ˜ ์žˆ๋„๋ก) ์ฟผ๋ฆฌ ๋ณ€๊ฒฝ

GET mbti/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "contents.nori_noun": {
              "query": "ENFP"
            }
          }
        }
        ],
      "should": [
        {
          "match": {
            "contents": {
              "query": "์•„์ดํฐ",
              "boost": 2
            }
          }
        },
        {
          "match": {
            "contents": {
              "query": "ENFP",
              "boost": 1
            }
          }
        }
        ]
    }
  }
}
๋ฐ˜์‘ํ˜•