"question": [ "ՆٳΈԿͰ͔͢ʁ", "࢝ٳՋԿͰ͔͢ʁ" ], # ૯͕४උͨ͠ൣղ "ground_truth": [ "4", "4" ], # LLMͷճ "answer": [ "3Ͱ͢ɻ", "5Ͱ͢ɻ" ], # ϕΫτϧDB͔Βऔಘ্ͨ͠ҐίϯςΩετ "contexts": [ [ "ՆٳΈ7ʙ9݄ͷӦۀͷதͰ4ɺબΜͰऔಘ͠·͢ɻ", "ՆٳΈ༗څͱผ్༩͞Ε·͢ɻ" ], [ "࢝ٳՋ11ʙ1݄ͷӦۀͷதͰ4ɺબΜͰऔಘ͠·͢ɻ", "࢝ٳՋ༗څͱผ్༩͞Ε·͢ɻ" ] ] } dataset = Dataset.from_dict(data_samples) # LLM-as-a-Judgeͱͯ͠ GPT-4oΛར༻ llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o", temperature=0)) # Faithfulness(≒Groundedness), Relevancy, Context PrecisionΛධՁ result = evaluate( dataset, metrics=[ faithfulness, answer_relevancy, context_precision ], llm=llm ) ͳΔ΄Ͳ