= 0.95,
    dtype = "half",
    enforce_eager = True,
    max_model_len = MAX_LENGTH,
)  # closes a call whose opening (and the keyword receiving 0.95) lies above this excerpt

# Obtain the tokenizer from the `llm` object.
tokenizer = llm.get_tokenizer()

# Sampling configuration passed to generation below: top_p 0.9, temperature 0,
# fixed seed 777, and a cap of a single generated token (max_tokens = 1).
sampling_params = vllm.SamplingParams(
    top_p = 0.9,
    temperature = 0,
    seed = 777,
    skip_special_tokens = True,
    max_tokens = 1,
)

# Run generation over `prompts` using the sampling parameters defined above.
# NOTE(review): generation is invoked on `model`, while the tokenizer above is
# taken from `llm` -- confirm both names are bound upstream; one of them may be
# a typo for the other.
responses = model.generate(
    prompts,
    sampling_params = sampling_params,
)
コードもシンプル モデルのロード tokenizer の取得 推論 パラメータ設定