Digdag workflow built by API
Tokenize (Japanese)
-- Tokenize Japanese article text: emits one row per (article_id, word) for
-- every token produced by tokenize_ja that passes the filters below.
--
-- NOTE(review): the Japanese literals in this query were restored from a
-- mis-encoded listing. "名詞-数" and "動詞-接尾" each had one character
-- missing and were inferred -- confirm them against the original query.
SELECT
    a.article_id,
    tok.word
FROM
    article a
LATERAL VIEW explode(
    tokenize_ja(
        -- NFKC folds width/compatibility variants before tokenizing
        normalize_unicode(a.content, 'NFKC'),
        -- tokenizer mode
        "normal",
        -- stop words
        array(
            "a", "about", "above", "across", "after", "again"
            -- ... remaining stop words are elided in this listing
        ),
        -- stop tags (part-of-speech tags whose tokens are dropped)
        array(
            "副詞", "助詞", "動詞", "記号", "名詞-数",
            "副詞-一般", "助詞-特殊", "動詞-接尾"
            -- ... remaining stop tags are elided in this listing
        ),
        -- user dictionary
        "https://s3.amazonaws.com/td-cdp-tagging/stable/kuromoji-user-dict-neologd.csv.gz"
    )
) tok AS word
WHERE
    length(tok.word) >= 2
    -- Acceptable characters only: hiragana, katakana, the prolonged sound
    -- mark, kanji, ASCII and full-width Latin letters, and ・！？
    AND tok.word RLIKE '^[ぁ-んーァ-ヶー一-龠a-zA-Zａ-ｚＡ-Ｚ・！？]+$'
    -- Even if a word consists only of acceptable characters, reject it when
    -- it is at most 2 characters long with no kanji, or at most 3 characters
    -- long and made up solely of hiragana / the prolonged sound mark.
    AND tok.word NOT RLIKE '^([^一-龠]{1,2}|[ぁ-んー]{1,3})$'