"""Generate dataset-level evaluation questions with an OpenAI chat model.

For each dataset class listed in ``clses`` the script loads the class's unique
contexts from JSON, condenses every context into a short token-window summary,
asks the chat model for users / tasks / questions about the combined
description, and writes the raw model answer to a text file.
"""

import json

from openai import OpenAI
from transformers import GPT2Tokenizer

# Number of tokens skipped at each end of a document before a summary window
# is sampled (the value the original slices hard-coded).
_EDGE_SKIP_TOKENS = 1000


def openai_complete_if_cache(
    model="gpt-4o", prompt=None, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Send one chat-completion request and return the first choice's text.

    Despite the name, no caching is performed here: every call creates a new
    ``OpenAI`` client and issues a request.

    Args:
        model: Name of the chat model to query.
        prompt: Content of the final ``user`` message.
        system_prompt: Optional content of a leading ``system`` message.
        history_messages: Optional sequence of prior message dicts inserted
            between the system message and the user message. ``None`` (the
            default) means no history.
        **kwargs: Forwarded unchanged to ``chat.completions.create``.

    Returns:
        The ``content`` of the first returned choice.
    """
    openai_client = OpenAI()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # ``None`` replaces the former mutable ``[]`` default; an empty or missing
    # history contributes nothing, exactly as before.
    if history_messages:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    response = openai_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )
    return response.choices[0].message.content


# Loaded once at import time and shared by every get_summary() call.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def get_summary(context, tot_tokens=2000):
    """Condense ``context`` into at most ``tot_tokens`` tokens.

    The summary is a window taken near the start of the document followed by a
    window taken near its end, each ``tot_tokens // 2`` tokens long. Up to
    ``_EDGE_SKIP_TOKENS`` tokens are skipped at both ends before the windows
    are cut.

    Short documents are handled explicitly:

    * a document that already fits the budget is returned whole;
    * otherwise the skipped margin is reduced just enough for the two windows
      to fit without overlapping.

    Args:
        context: Text to summarise.
        tot_tokens: Token budget for the summary.

    Returns:
        The selected tokens converted back to a string.
    """
    tokens = tokenizer.tokenize(context)
    half_tokens = max(tot_tokens, 0) // 2
    window_total = 2 * half_tokens
    n_tokens = len(tokens)

    if n_tokens <= window_total:
        # Nothing to cut: the whole document is within the budget.
        summary_tokens = tokens
    else:
        # Shrink the margin when the document is too short for the full skip,
        # so that head and tail windows never overlap.
        skip = min(_EDGE_SKIP_TOKENS, (n_tokens - window_total) // 2)
        start_tokens = tokens[skip : skip + half_tokens]
        # Explicit indices: the original used a positive stop (``: 1000``)
        # where a distance from the end was intended, which left this window
        # empty for long documents.
        tail_stop = n_tokens - skip
        end_tokens = tokens[tail_stop - half_tokens : tail_stop]
        summary_tokens = start_tokens + end_tokens

    return tokenizer.convert_tokens_to_string(summary_tokens)


def _build_prompt(total_description):
    """Return the question-generation prompt for ``total_description``.

    The text is assembled from adjacent literals only to keep source lines
    short; the resulting string is unchanged.
    """
    return (
        f" Given the following description of a dataset: {total_description} "
        "Please identify 5 potential users who would engage with this dataset. "
        "For each user, list 5 tasks they would perform with this dataset. "
        "Then, for each (user, task) combination, generate 5 questions that "
        "require a high-level understanding of the entire dataset. "
        "Output the results in the following structure: "
        "- User 1: [user description] "
        "- Task 1: [task description] "
        "- Question 1: - Question 2: - Question 3: - Question 4: - Question 5: "
        "- Task 2: [task description] ... "
        "- Task 5: [task description] "
        "- User 2: [user description] ... "
        "- User 5: [user description] ... \n"
    )


clses = ["agriculture"]
for cls in clses:
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(
        f"../datasets/unique_contexts/{cls}_unique_contexts.json",
        mode="r",
        encoding="utf-8",
    ) as f:
        unique_contexts = json.load(f)

    summaries = [get_summary(context) for context in unique_contexts]
    total_description = "\n\n".join(summaries)

    prompt = _build_prompt(total_description)
    result = openai_complete_if_cache(model="gpt-4o", prompt=prompt)

    file_path = f"../datasets/questions/{cls}_questions.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(result)

    print(f"{cls}_questions written to {file_path}")