CarPlanningProblemGen/llm_qa_direct_only.py at main · AugmentedDesignLab/CarPlanningProblemGen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
## This script evaluates LLM responses when directly answering a question and when answering considering
## the logic of the PDDL file.

import os
import json
import matplotlib.pyplot as plt
import planner # Comment out any function calls within this.
from openai import OpenAI

########### ============  Global initializations ====================== ##########
# Directory listing of parsed_womdr_data/, taken once when the module loads.
parsed_file_list = os.listdir("parsed_womdr_data/")

# One OpenAI-SDK client per provider. The DeepSeek and DeepInfra clients reuse
# the same SDK class and differ only in API key and base_url. Each key is read
# from the environment, so a missing variable raises KeyError at import time.
client_oai = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)
client_deepseek = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com",
)
client_deepinfra = OpenAI(
    api_key=os.environ["DEEPINFRA_API_KEY"],
    base_url="https://api.deepinfra.com/v1/openai",
)

# Scenario data keyed by scenario id; the grading code below reads
# [scenario_id]["Context"] and [scenario_id]["Interactions"][interaction_id].
scenario_domain_and_problem_data = planner.retrieve_womdr_domain_problem_data()

# The following are model names for DeepInfra provided models
# "deepseek-ai/DeepSeek-V3"
# "deepseek-ai/DeepSeek-R1" # This model thinks.
# "meta-llama/Llama-3.3-70B-Instruct-Turbo"
# "meta-llama/Meta-Llama-3.1-405B-Instruct"
# "Qwen/Qwen2.5-72B-Instruct"
# "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" # This model thinks.
# "google/gemma-2-9b-it"
# "meta-llama/Meta-Llama-3.1-8B-Instruct"
# "Qwen/Qwen2.5-7B-Instruct"
# "microsoft/phi-4"

# # The following are the small model names for models provided via the OpenAI API service
# "gpt-4o-mini"
# "o3-mini"

# Models under evaluation, grouped by provider family. Each model name maps to
# a list that collects one averaged judge score per graded interaction.
model_dictionary = {
    "openai_models": {
        "gpt-4o-mini": [],
    },
    "deepinfra_models": {},
}

# Generate two lists - a domain file list and a problem file list - for a single scenario.
# Reuse code in the form of classes and functions.

# Module-level result containers, all starting out empty.
model_outputs: dict = {}
# Filled as existing_grades[scenario_id][interaction_id][<model key>] by the grading code.
existing_grades: dict = {}
scenario_qa_score: dict = {}

######## =================  LLM API calls ====================== ###########
def openai_call(model_name, prompt):
    """Send `prompt` as a single user message through the OpenAI client.

    Args:
        model_name: Model identifier passed to the chat-completions endpoint.
        prompt: Text used as the content of the only (user) message.

    Returns:
        The message content of the first choice in the non-streamed response.
    """
    user_message = {"role": "user", "content": prompt}
    response = client_oai.chat.completions.create(
        model=model_name,
        messages=[user_message],
        stream=False,
    )
    return response.choices[0].message.content

def deepinfra_call(model_name, prompt):
    """Send `prompt` as a single user message through the DeepInfra client.

    Args:
        model_name: Model identifier passed to the chat-completions endpoint.
        prompt: Text used as the content of the only (user) message.

    Returns:
        The message content of the first choice in the non-streamed response.
    """
    user_message = {"role": "user", "content": prompt}
    response = client_deepinfra.chat.completions.create(
        model=model_name,
        messages=[user_message],
        stream=False,
    )
    return response.choices[0].message.content

def deepseek_call(model_name, prompt):
    """Send `prompt` as a single user message through the DeepSeek client.

    Args:
        model_name: Model identifier passed to the chat-completions endpoint.
        prompt: Text used as the content of the only (user) message.

    Returns:
        The message content of the first choice in the non-streamed response.
    """
    user_message = {"role": "user", "content": prompt}
    response = client_deepseek.chat.completions.create(
        model=model_name,
        messages=[user_message],
        stream=False,
    )
    return response.choices[0].message.content
################# ============== QA prompts =====================
def generate_qa_prompt(context, question, answer, prompt_type="4shot"):
    """Build the question-answering prompt for the model under evaluation.

    Args:
        context: Scenario description, interpolated verbatim into the prompt.
        question: The question the model is asked to answer.
        answer: Ground-truth answer. Kept in the signature for callers but
            deliberately not used, so it never appears in the prompt.
        prompt_type: "4shot" for the prompt with four worked examples, or
            "direct" for the zero-shot "think step by step" prompt.

    Returns:
        The formatted prompt string for the requested prompt type.

    Raises:
        ValueError: If `prompt_type` is neither "4shot" nor "direct".
    """
    # Fail loudly on a typo instead of returning None, which would otherwise
    # be forwarded to the API as the message content.
    if prompt_type not in ("4shot", "direct"):
        raise ValueError(
            f"Unknown prompt_type {prompt_type!r}; expected '4shot' or 'direct'"
        )

    direct_prompt = f"""
        Here is some information about an autonomous vehicle scenario:
        {context}

        Answer the following question:
        {question}

        Think step by step. Show your reasoning and answer the question.

        """

    direct_cot_prompt_4shot = f"""
    I want you to answer some questions from the world of autonomous vehicle testing.

    Here are some examples of questions being answered:
    First, some information about the context: "Can you describe the current road configuration in terms of lanes? The road has three lanes.What traffic controls are present in the current driving scene? There are no traffic controls present in the current driving scene.What is the ego agent's current velocity? The ego agent's current speed is 6 meters per second.Is the ego agent's speed constant or changing? The ego agent is accelerating.Could you specify the ego agent's current lane position? The ego agent is on the first lane from the right.What is the ego agent's current direction of travel? The ego agent is heading in the same direction as its current lane.What type of agent is surrounding agent #0? Surrounding agent #0 is a vehicle.How fast is surrounding agent #0 moving at the moment? Surrounding agent #0's current speed is 5 meters per second.What is the motion status of surrounding agent #0? Surrounding agent #0 is accelerating.Where is surrounding agent #0 in relation to the ego agent? Surrounding agent #0 is 4 meters on the left and 1 meter in front of the ego agent.What direction is surrounding agent #0 facing compared to the ego agent? Surrounding agent #0 is heading in the same direction as the ego agent.What type of agent is surrounding agent #1? Surrounding agent #1 is a vehicle.What is the current speed of surrounding agent #1? Surrounding agent #1's current speed is 4 meters per second.Is surrounding agent #1 accelerating or maintaining its speed? Surrounding agent #1 is moving at a constant speed.Can you describe the position of surrounding agent #1 relative to the ego agent? Surrounding agent #1 is 24 meters behind and 3 meters on the left of the ego agent.In which direction is surrounding agent #1 moving with respect to the ego agent? Surrounding agent #1 is heading in the same direction as the ego agent.What type of agent is surrounding agent #3? Surrounding agent #3 is a vehicle.What is the current velocity of surrounding agent #3? 
Surrounding agent #3 is not moving.Where is surrounding agent #3 located in relation to the ego agent? Surrounding agent #3 is 4 meters in front and 4 meters on the right of the ego agent.What direction is surrounding agent #3 facing in relation to the ego agent? Surrounding agent #3 is heading in the same direction as the ego agent.What type of agent is surrounding agent #4? Surrounding agent #4 is a vehicle.What is the motion status of surrounding agent #4? Surrounding agent #4 is not moving.Can you describe the position of surrounding agent #4 with respect to the ego agent? Surrounding agent #4 is 9 meters on the right and 1 meter behind the ego agent.In which direction is surrounding agent #4 heading compared to the ego agent? Surrounding agent #4 is heading the opposite direction as the ego agent.What type of agent is surrounding agent #5? Surrounding agent #5 is a vehicle.Is surrounding agent #5 currently in motion? Surrounding agent #5 is not moving.Where is surrounding agent #5 situated in relation to the ego agent? Surrounding agent #5 is 11 meters in front and 2 meters on the right of the ego agent.What direction is surrounding agent #5 facing with respect to the ego agent? Surrounding agent #5 is heading right of the ego agent.What type of agent is surrounding agent #6? Surrounding agent #6 is a vehicle.What is the current speed of surrounding agent #6? Surrounding agent #6 is not moving.Can you describe the position of surrounding agent #6 relative to the ego agent? Surrounding agent #6 is 14 meters in front and 7 meters on the right of the ego agent.In which direction is surrounding agent #6 moving with respect to the ego agent? Surrounding agent #6 is heading right of the ego agent."

    Question: "What interactions are anticipated between the ego agent and surrounding agent #0?"
    Answer: "Surrounding agent #0 will overtake the ego agent as it is accelerating and will be further ahead in the future."

    Question: "Can you predict the interaction between the ego agent and surrounding agent #4?"
    Answer: "There will be no interaction between the ego agent and surrounding agent #4 as they are heading in opposite directions and not affecting each other's path."

    Question: "What is the ego agent's plan for the immediate future?"
    Answer: "The ego agent intends to continue on its current path and lane while accelerating. It will overtake surrounding agent #3 and pass surrounding agents #5 and #6, as they are not moving. It will also be overtaken by surrounding agent #0, which is accelerating on the left side."

    Question: "What will be the nature of the interaction between the ego agent and surrounding agent #6?"
    Answer: "The ego agent will pass surrounding agent #6 since surrounding agent #6 is stationary and the ego agent is accelerating."

    Given these examples now please have a look at the following new context and try to answer the following question:
    Here is the context: {context}

    Here is the question: {question}

    """
    if prompt_type == "4shot":
        return direct_cot_prompt_4shot
    return direct_prompt


################# ============= Grading via LLM as a judge prompts ================== ###############
def prepare_grading_prompt(context, question, answer, model_output):
    """Compose the prompt given to the judge model for scoring one answer.

    The prompt presents the scenario context, the question, the ground-truth
    answer and the evaluated model's attempt, then asks for a correctness and
    a faithfulness score (each 1 to 10) with explanations, formatted as a
    Python dictionary.

    Args:
        context: Scenario description shown to the judge.
        question: The question that was asked about the scenario.
        answer: Ground-truth answer to compare against.
        model_output: The evaluated model's answer to be graded.

    Returns:
        The grading prompt as a single string.
    """
    return f"""
        Here is some context about the test scenario:
        {context}

        This question was asked with regards to this context:
        {question}

        This is the ground truth answer:
        {answer}

        This was the attempt by an AI for this question
        {model_output}

        Grade this answer on the following aspects:
        1. The correctness of the AI answer with respect to the ground truth answer. Give it a score between 1 to 10.
        Explain why this score was given by you in detail.
        2. The faithfulness of the reasoning. Are the conclusions drawn in the answer given by the AI consistent with its reasoning? Here, give it a score between 1 to 10.
        Explain why this score was given by you in detail.

        Format the answer in a python dictionary format like this.
        <open curly bracket>:
        "Correctness score": "<Only enter the score number here>",
        "Correctness explanation": "<Write your explanation here>",
        "Faithfulness score": "<Only enter the score number here>",
        "Faithfulness explanation": "<Write your explanation here>",
        <close curly bracket>

        Don't write anything else. Nothing else, nothing else, nothing else.
        Please only write it in the format requested.
        """

############### =============== Evaluating Interactions ================ ##############
def _grade_model_answer(call_model, model_family, model_name, generated_prompt,
                        context, question, answer, interaction_grades):
    """Query one model, have the judge grade its answer, record and return the score.

    Args:
        call_model: Function taking `model_name` and `prompt` keyword arguments
            and returning the model's reply text.
        model_family: Provider family name; used as the prefix of the grade key.
        model_name: Name of the model being evaluated.
        generated_prompt: QA prompt sent to the evaluated model.
        context: Scenario context forwarded to the judge.
        question: Question forwarded to the judge.
        answer: Ground-truth answer forwarded to the judge.
        interaction_grades: Dict of grades for the current interaction; updated in place.

    Returns:
        The mean of the judge's correctness and faithfulness scores.
    """
    import ast  # local import keeps this block self-contained

    model_output = call_model(model_name=model_name, prompt=generated_prompt)
    grading_prompt = prepare_grading_prompt(context=context, question=question,
                                            answer=answer, model_output=model_output)
    judge_reply = deepinfra_call(model_name="deepseek-ai/DeepSeek-V3", prompt=grading_prompt)

    # SECURITY: the judge reply is untrusted text coming back from a remote LLM.
    # It used to be passed to eval(), which would execute arbitrary code.
    # ast.literal_eval only accepts Python literals (such as the requested dict)
    # and raises ValueError/SyntaxError on anything else.
    grading_output = ast.literal_eval(judge_reply)

    grade_key = model_family + "_" + model_name + "_modelname"
    # setdefault keeps an already-stored grade for this key instead of overwriting it.
    interaction_grades.setdefault(grade_key, grading_output)
    avg_score = (int(grading_output["Correctness score"])
                 + int(grading_output["Faithfulness score"])) / 2
    interaction_grades[grade_key].setdefault("problem_score_avg", str(avg_score))
    return avg_score


def grade_openai_deepinfra_models_one_interaction(model_dictionary,
                                                  existing_grades,
                                                  scenario_id,
                                                  interaction_id,
                                                  prompt_type):
    """Grade every configured model on one interaction of one scenario.

    For each model in the "openai_models" and "deepinfra_models" families, the
    QA prompt is sent to the model, the reply is graded by the judge model
    ("deepseek-ai/DeepSeek-V3" via DeepInfra), and the result is stored.
    Families with any other name are ignored.

    Args:
        model_dictionary: Mapping of family name to {model name: score list};
            the averaged score is appended to each model's list.
        existing_grades: Nested dict in which
            existing_grades[scenario_id][interaction_id] must already exist;
            the judge output is stored there per model.
        scenario_id: Key into scenario_domain_and_problem_data.
        interaction_id: Key into that scenario's "Interactions".
        prompt_type: Prompt style forwarded to generate_qa_prompt.
    """
    #### Step 1: Generate the direct QA prompt ======================= #########
    scenario = scenario_domain_and_problem_data[scenario_id]
    interaction = scenario["Interactions"][interaction_id]
    context = scenario["Context"]
    question = interaction["problem_data"]
    answer = interaction["answer_data"]

    generated_prompt = generate_qa_prompt(context, question, answer, prompt_type)

    #### Step 2: Generate the model grades and add them to the dictionary
    # Both families follow the same flow and differ only in the API client used.
    family_callers = {
        "openai_models": openai_call,
        "deepinfra_models": deepinfra_call,
    }
    for model_family, family_models in model_dictionary.items():
        call_model = family_callers.get(model_family)
        if call_model is None:
            continue
        for model_name in family_models:
            avg_score = _grade_model_answer(
                call_model, model_family, model_name, generated_prompt,
                context, question, answer,
                existing_grades[scenario_id][interaction_id],
            )
            family_models[model_name].append(avg_score)


def pddl_response_and_answer_questions(prompt_type="4shot"):
    """Grade every interaction of every scenario and dump the grades to JSON.

    Walks scenario_domain_and_problem_data, makes sure the module-level
    existing_grades dict has an entry for each scenario/interaction, grades
    each interaction, and finally writes existing_grades to
    grades/direct/deepseek_grades_direct_<prompt_type>.json.

    Args:
        prompt_type: Prompt style forwarded to the grading step; also part of
            the output file name.
    """
    grade_path = "grades/direct/deepseek_grades_direct_" + prompt_type + ".json"
    # Create the output folder before any API call is made: previously a
    # missing folder only surfaced at the final write, after all model and
    # judge calls had already been spent, and the results were lost.
    os.makedirs(os.path.dirname(grade_path), exist_ok=True)

    for scenario_id, scenario in scenario_domain_and_problem_data.items():
        existing_grades.setdefault(scenario_id, {})
        for interaction_id in scenario["Interactions"]:
            existing_grades[scenario_id].setdefault(interaction_id, {})

            ##### ===================== Automatic model evaluation with LLM grades on outputs ============== #########
            grade_openai_deepinfra_models_one_interaction(model_dictionary=model_dictionary,
                                                          existing_grades=existing_grades,
                                                          scenario_id=scenario_id,
                                                          interaction_id=interaction_id,
                                                          prompt_type=prompt_type)

    # The with-block closes the file; no explicit close() is needed.
    with open(grade_path, 'w') as grade_file:
        print("Existing grades is given by {}".format(existing_grades))
        json.dump(existing_grades, grade_file, indent=4)

def main():
    """Run the evaluation and show one bar chart of scores per model.

    Each chart plots the scores collected in model_dictionary for one model,
    with the position in the score list on the x axis.
    """
    # Change parameter here depending on the prompt.
    prompt_type = "direct"
    pddl_response_and_answer_questions(prompt_type=prompt_type)
    for provider_models in model_dictionary.values():
        for scores in provider_models.values():
            plt.bar(list(range(len(scores))), scores)
            plt.show()


# Only run the evaluation when executed as a script, so importing this module
# (e.g. to reuse the prompt builders) does not start a full run of API calls.
if __name__ == "__main__":
    main()