TonicAI/tvallogging: A tool for evaluating and tracking your RAG experiments. This repo contains the Python SDK for logging to Tonic Validate.

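The snippet below creates a benchmark, attaches it to a new project, starts a run, and logs each RAG response for scoring. Assuming the SDK is published on PyPI under the repo name (check the repository's README for the official command), it can be installed with:

pip install tvallogging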
import os
# to set the environment variables via python
os.environ["OPENAI_API_KEY"] = "put-your-openai-api-key-here"
os.environ["TONIC_VALIDATE_API_KEY"] = "put-your-tonic-validate-api-key-here"

from typing import Dict, List

from tvallogging.api import TonicValidateApi
from tvallogging.chat_objects import Benchmark

project_name: str  # name of your new project
benchmark_name: str  # name of your new benchmark

# list of dictionaries of the form
# {
#     "question": "question for the benchmark",
#     "answer": "reference answer to the question"
# }
question_with_answer_list: List[Dict[str, str]]
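# for illustration, a filled-in list might look like this
# (the question/answer pairs below are hypothetical):
# question_with_answer_list = [
#     {"question": "What is Tonic Validate?", "answer": "A RAG evaluation tool."},
#     {"question": "What does the SDK log?", "answer": "LLM answers and retrieved context."},
# ]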

# the client authenticates with the TONIC_VALIDATE_API_KEY environment variable set above
api = TonicValidateApi()

# upload the benchmark to Tonic Validate; new_benchmark returns the benchmark's ID
benchmark = Benchmark.from_json_list(question_with_answer_list)
benchmark_id = api.new_benchmark(benchmark, benchmark_name)

# create a project tied to the benchmark
project = api.new_project(benchmark_id, project_name)

# start a run, naming the LLM that scores the responses
llm_evaluator = "gpt-4"
run = project.new_run(llm_evaluator)

for question_with_answer in run.benchmark.question_with_answer_list:

    question = question_with_answer.question

    llm_answer: str  # answer obtained from the RAG application
    retrieved_context_list: List[str]  # list of context retrieved by the RAG application

    # log the llm_answer and retrieved_context_list to Tonic Validate;
    # in this step, the RAG metrics are calculated locally
    run.log(question_with_answer, llm_answer, retrieved_context_list)
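For a complete script, the two placeholders inside the loop come from your own RAG application. A minimal sketch, assuming a hypothetical query_rag function (not part of the SDK) that returns the generated answer together with the list of retrieved context strings:

for question_with_answer in run.benchmark.question_with_answer_list:
    question = question_with_answer.question
    # query_rag is a stand-in for however your RAG application is invoked
    llm_answer, retrieved_context_list = query_rag(question)
    run.log(question_with_answer, llm_answer, retrieved_context_list)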