# Clean each issue text before tokenizing: strip hyphen runs, collapse digit
# runs to a single "0", and remove all whitespace.
# NOTE(review): the original inline comments were mojibake Japanese; intent
# below is reconstructed from the code itself — several chained substitutions,
# with empty results skipped so blank strings don't enter the corpus.
# The three constant patterns are compiled once, outside the loop, instead of
# being recompiled on every iteration; r"..." avoids the invalid-escape
# deprecation that bare '\s+' triggers.
_hyphen_run = re.compile(r"-+")
_digit_run = re.compile(r"[0-9]+")
_whitespace = re.compile(r"\s+")

for t in issues:
    result = _hyphen_run.sub("", t)        # drop runs of hyphens
    result = _digit_run.sub("0", result)   # normalize any number to "0"
    result = _whitespace.sub("", result)   # remove all whitespace characters
    # Cleaning can leave an empty string; keep only non-empty results.
    if result:
        sub_texts.append(result)
        filtered_text.append(result)
        print("text:%s" % result)
wakati") wakati.parse("") words = wakati.parse(text) # Make word list if words[-1] == u"\n": words = words[:-1] return words texts = [tokenize(a) for a in samples] Β͓࣌ؒ͘Λଷ͓ͯ͠Γ·͢ Β͘ ͓ ࣌ؒ Λ ଷ ͠ ͯ ͓Γ ·͢