Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応

Kunihiko Kido
September 16, 2014

Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応

第6回elasticsearch勉強会「Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応」資料

Kunihiko Kido

September 16, 2014
Tweet

More Decks by Kunihiko Kido

Other Decks in Technology

Transcript

  1. マッピング定義と目的 １ ２ ３ ４ ５ ６ 適合率を 向上したい 再現率を

    向上したい 正確に 絞り込みたい 集計したい 表示したい and more しかもフィールド毎に。。。 考えることがいっぱい！
  2. マッピング定義と目的 １ ２ ３ ４ ５ ６ 完全一致で 検索・集計したい 「都」「府」「県」

    を省略して検索したい ヨミで検索したい 都道府県コード 順でソートしたい 「関東」など 別名で検索・集計したい and more… 例えば、都道府県名フィールド１つとっても
  3. Demo DELETE /myindex! ! PUT /myindex/mytype/1! ! {! "title": "Elasticsearch

    特徴まとめ",! "description": "Elasticsearch Features — 主にシステムを中心とした特徴まとめ",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": "2014-03-12T11:09"! } いつも通り、こんな感じでデータ登録するだけで、 フィールド毎に必要なマッピング定義が出来上がる
  4. Demo {! ...! "title" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! },! "yomi" : {! "type" : "string",! "analyzer" : "katakana"! }! }! }! ...! }! titleフィールドのマッピング定義 ※動的に作成されたマッピング定義
  5. Demo {! ...! "description" : {! "type" : "string",! "fields"

    : {! "substring" : {! "type" : "string",! "analyzer" : "cjk"! }! }! },! ...! } ※動的に作成されたマッピング定義 descriptionフィールドのマッピング定義
  6. Demo {! ...! "author" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! },! "yomi" : {! "type" : "string",! "analyzer" : "katakana"! }! }! }! ...! }! authorフィールドのマッピング定義 ※動的に作成されたマッピング定義
  7. Demo {! ...! "link" : {! "type" : "string",! "index"

    : "not_analyzed",! "fields" : {! "domain" : {! "type" : "string",! "analyzer" : "domain_analyzer"! },! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "substring" : {! "type" : "string",! "analyzer" : "standard"! }! }! },! ...! } ※動的に作成されたマッピング定義 linkフィールドのマッピング定義
  8. Demo {! ...! "tags" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! }! }! },! ...! }! ※動的に作成されたマッピング定義 tagsフィールドのマッピング定義
  9. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch Overview",! "description": "Elasticsearch Features —

    System Overview",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": “2014-03-12T11:09”,! "language": “en",! } Demo 「英語」の文章としてインデックスしたい場合 これだけ
  10. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch 개요",! "description": "Elasticsearch Features —

    시스템 개요",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": “2014-03-12T11:09”,! "language": “ko",! } Demo 「韓国語」の文章としてインデックスしたい場合 いがいと簡単！
  11. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch Überblick",! "description": "Elasticsearch Features —

    Systemübersicht",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": “2014-03-12T11:09”,! "language": “de",! } Demo 「ドイツ語」の文章としてインデックスしたい場合 便利かも！
  12. dynamic templates {! ...! "dynamic_templates": [! {! "my_field": {! "match": "*",

    /* ① フィールド名のパターン */! "match_pattern": "regex", /* ② matchのマッチング方式 （省略可）*/! "match_mapping_type": "string", /* ③ JSONフォーマットのタイプ */! "mapping": {...} /* ④ マッピング定義 */! }! },! ...(複数定義できる)! ],! ...! }! ! 基本的な設定
  13. dynamic templates {! "url_fields": {! "match": ".*url|.*link",! "match_pattern": "regex",! "match_mapping_type": "string",!

    "mapping": {! "type": "string",! "index": "not_analyzed",! "fields": {! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "standard"! },! "domain": {! "type": "string",! "analyzer": "domain_analyzer"! }! }! }! }! }! 例）URLフィールド向け
  14. dynamic templates {! "special_string_fields": {! "match": ".*title.*|.*name.*|.*author.*",! "match_pattern": "regex",! "match_mapping_type": "string",!

    "mapping": {! "type": "string",! "fields": {! "raw": {! "type": "string",! "index": "not_analyzed"! },! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "ja-substring"! },! "yomi": {! "type": "string",! "analyzer": "katakana"! }! }! }! }! } 例）名称等重要なフィールド
  15. dynamic templates {! "long_string_fields": {! "match": ".*message.*|.*content.*|.*description.*| .*text.*|.*body.*|.*note.*|.*memo.*",! "match_pattern": "regex",! "match_mapping_type":

    "string",! "mapping": {! "type": "string",! "fields": {! "substring": {! "type": "string",! "analyzer": "ja-substring"! }! }! }! }! } 例）長文フィールド
  16. dynamic templates {! "short_string_fields": {! "match": "*",! "match_mapping_type": "string",! "mapping": {!

    "type": "string",! "fields": {! "raw": {! "type": "string",! "index": "not_analyzed"! },! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "ja-substring"! }! }! }! }! } 例）その他フィールド
  17. dynamic templates {! ...! "dynamic_templates": [! {! “url_field": {…}! },! {!

    "special_string_fields": {…}! },! {! "long_string_fields": {…}! },! {! "short_string_fields": {…}! },! ],! ...! } 最初にパターンにマッチしたマッピングが定義される
  18. index template config/templates配下に保存するだけで準備OK ▼elasticsearch ▼config ▼templates base.json custom_analyzers.json japanese_analyzers.json language_analyzers.json string_fields.json

    Demoで使っているindex templateファイルたち ← ・ノードの再起動は必要ない ・新規で作成したインデックスのみに適用される ・テンプレートは複数作れる ・APIでも設定できる ・dynamic templatesの定義も含められる
  19. index template {! "my_template":{ /* テンプレートの名前 */! "template": "*", /* テンプレートを適用するインデクス名のパターン

    */! "order": 0, /* テンプレートを適用する優先順位 */! "settings": {...}, /* シャードの数とか、Analysis の設定等インデックス定義に関わる設定 */! "mappings": {...} /* マッピング定義の設定。dynamic templatesはここで設定 */! . . .! }! }! ! ! ! ! ! ファイル名とテンプレート名はとりあえず同じにしておけば良いかな？ 基本的な設定
  20. index template {! "base": {! "template": "*",! "order": 1,! "mappings": {!

    "_default_": {! "_timestamp" : {! "enabled" : true,! "store" : true! },! "_analyzer": {! "path": "language"! },! "_id": {! "path": "id"! },! "_source": {! "excludes" : ["attachment_file"]! }! }! }! }! }! templates/base.json デフォルトのアナライザー変更するためのlanguageフィールドの定義など
  21. index template {! "language_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "filter": {! "arabic_stop": {! "type": "stop",! "stopwords": "_arabic_"! },! "arabic_stemmer": {! "type": "stemmer",! "language": "arabic"! },! "armenian_stop": {! "type": "stop",! "stopwords": "_armenian_"! },! "armenian_stemmer": {! "type": "stemmer",! "language": "armenian"! },! "basque_stop": {! "type": "stop",! "stopwords": "_basque_"! },! "basque_stemmer": {! "type": "stemmer",! "language": "basque"! },! "brazilian_stop": {! "type": "stop",! "stopwords": "_brazilian_"! },! "brazilian_stemmer": {! "type": "stemmer",! "language": "brazilian"! },! "bulgarian_stop": {! "type": "stop",! "stopwords": "_bulgarian_"! },! "bulgarian_stemmer": {! "type": "stemmer",! "language": "bulgarian"! },! "catalan_elision": {! templates/language_analyzers.json 各国言語毎のフィルターやアナライザーの定義など
  22. index template {! "japanese_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "char_filter": {! "japanese_normalization": {! "type": "kuromoji_iteration_mark",! "normalize_kanji": true,! "normalize_kana": true! }! },! "filter": {! "romaji_readingform": {! "type": "kuromoji_readingform",! "use_romaji": true! },! "katakana_readingform": {! "type": "kuromoji_readingform",! "use_romaji": false! },! "katakana_stemmer": {! "type": "kuromoji_stemmer",! "minimum_length": 4! },! "japanese_stop": {! "type": "kuromoji_part_of_speech"! }! },! "tokenizer": {! "japanese_tokenizer": {! "type": "kuromoji_tokenizer",! "mode": "search"! }! },! "analyzer": {! "default": {! "alias": ["japanese", "ja"],! "type": "custom",! "char_filter": [! "html_strip",! "japanese_normalization"! ],! "tokenizer": "japanese_tokenizer",! "filter": [! "cjk_width",! "lowercase",! templates/japanese_analyzers.json 日本語関連のフィルターやアナライザーの定義など
  23. index template {! "custom_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "filter": {! "domain_extractor" : {! "type" : "pattern_replace",! "preserve_original": false,! "pattern" : "https?://([^/]+).*",! "replacement": "$1"! }! },! "tokenizer": {! "comma_tokenizer": {! "type": "pattern",! "pattern":"[,、]+"! }! },! "analyzer": {! "domain_analyzer" : {! "alias": ["domain"],! "tokenizer" : "uax_url_email",! "filter" : [ "domain_extractor", "lowercase", "unique" ]! },! "comma_analyzer":{! "alias": ["comma"],! "type": "custom",! "tokenizer": "comma_tokenizer",! "filter": ["cjk_width", "lowercase", "trim", "unique"]! },! "keyword_analyzer": {! "type": "custom",! "tokenizer": "keyword",! "filter": ["cjk_width", "lowercase", "trim"]! }! }! }! }! }! } templates/custom_analyzers.json URLからドメイン名を抽出したり言語に依存しないアナライザーの定義
  24. index template {! "string_fields": {! "template": "*",! "order": 10,! "mappings": {!

    "_default_": {! "dynamic_templates": [! {! "not_analyzed_fields": {! "match": "method|charSet|mimeType| content_type|language",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! "type": "string",! "index": "not_analyzed"! }! }! },! {! "url_fields": {! "match": ".*url|.*link",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! "type": "string",! "index": "not_analyzed",! "fields": {! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "standard"! },! "domain": {! "type": "string",! "analyzer": "domain_analyzer"! }! }! }! }! },! {! "comma_separated_fields": {! "match": "meta_keywords",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! templates/string_fields.json string型フィールドのダイナミックテンプレート定義
  25. index template ▼インデックス作成 settings要素の適用 mappings要素の適用 ▼フィールドデータの追加 dynamic templatesの適用 イレギュラーな対応したい場合 ✔️ ✔️ ✔️

    対象のフィールド定義が存在しない場合、dynamic templatesの定義はこのタイミングで適用。 ← イレギュラーなマッピング定義をしたい場合はこのタイミングで定義すればOK ▼タイプの作成