Slide 11
Slide 11 text
tokenize 後の結果も変わる
● tokenize の結果も変わる。json_decode() を使った方が token 数を抑えられそう。
from transformers import AutoTokenizer

# Compare how the two prompt variants tokenize under the Llama-3 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")


def _to_input_ids(prompts):
    # Tokenize with truncation at 500 tokens and keep only the id sequence.
    return tokenizer(prompts, max_length=500, truncation=True)["input_ids"]


prompt_list_jd_token = _to_input_ids(prompt_list_jd)
prompt_list_me_token = _to_input_ids(prompt_list_me)
print("json_decode:", prompt_list_jd_token)
print("map_element:", prompt_list_me_token)
json_decode: [128000, 32, 4382, 87901, 369, 52845, 512, 72059, 358, 6562, 358, 1436, 13555, 9115, 6847, 1875, 791, 1396, 315, 12197,
304, 1855, 3492, 374, 264, 16099, 315, 52845, 627, 7968, 420, 374, 837, 13]
map_element: [128000, 32, 4382, 87901, 369, 1144, 84, 2839, 66, 15, 7338, 77, 2153, 4438, 358, 6562, 358, 1436, 13555, 9115, 6847,
23041, 77, 1734, 791, 1396, 315, 12197, 304, 1855, 3492, 374, 264, 16099, 315, 1144, 84, 2839, 66, 15, 7255, 77, 7968, 420, 374, 837,
13]
\u03c0
π