Slide 11
Slide 11 text
ベクタライズ
● Tokenize By MeCab(ipadic)
● Vectorize By fasttext
(trained on cookpad recipe
title texts)
● 1 sentence → 100 dim
vector
● これを csv にします
# Load the trained fastText model and write one averaged sentence vector per
# input row to a gzip-compressed CSV. Each output line has the form
# "<id>,<v0>,<v1>,...".
logger.info("Load latest fasttext")
embeddings_model = fasttext.FastText.load_model("fasttext.model")

logger.info("Write vectors to CSV.GZ")
# The vector size is fixed by the model, so query it once rather than per row.
dimension = embeddings_model.get_dimension()
with gzip.open(output_file_name, "wb") as file:
    for row_id, segmented_text in zip(ids, segmented_texts):
        # The text is already segmented; tokens are whitespace-separated.
        tokens = segmented_text.split()

        # Sentence vector = arithmetic mean of the token vectors.
        embed = np.zeros(dimension)
        for token in tokens:
            embed += embeddings_model[token]
        # Guard against empty text: dividing by len(tokens) == 0 would turn
        # the whole row into NaN. Such rows are written as the zero vector.
        if tokens:
            embed /= len(tokens)

        # Build the whole line first, then encode and write it exactly once.
        fields = [str(row_id)]
        fields.extend(str(value) for value in embed.tolist())
        file.write((",".join(fields) + "\n").encode("utf-8"))