import os
import json
import time
import asyncio
import warnings

import nest_asyncio
from dotenv import load_dotenv
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Apply the nest_asyncio patch to allow nested event loops
nest_asyncio.apply()

# Suppress all warnings
warnings.filterwarnings("ignore")

# Load environment variables; ChatOpenAI also picks up OPENAI_API_KEY from the environment
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')


# Step 1: Load the JSON file synchronously
def load_json_file(file_path):
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    print('Loaded data')
    return data


# Asynchronous function to run the Langchain chain on an individual invention
async def process_invention(featureListExtractionChain, key, value):
    print(f'Generating Feature List for {key}')
    invention_text = value["invention"]

    # Process the invention text with the Langchain chain asynchronously
    result = await featureListExtractionChain.arun({"invention": invention_text})

    # Clean the result to remove code block markers and extra formatting
    cleaned_result = result.replace("```json", "").replace("```", "").strip()

    # Try to parse the cleaned result as JSON
    try:
        parsed_result = json.loads(cleaned_result)
    except json.JSONDecodeError as e:
        print(f"Error parsing result for {key}: {e}")
        parsed_result = cleaned_result  # Fall back to the raw string if parsing fails

    # Return the key and result for later collection
    return key, {"invention": invention_text, "result": parsed_result}


# Step 2: Run the Langchain prompt on each invention asynchronously
async def run_langchain_on_inventions(data, model_name='gpt-4o-mini'):
    # Prompt template
    prompt_template = PromptTemplate(
        input_variables=["invention"],
        template="""
Break down the provided text, which may be an invention claim or description, into concise technical features using only the exact terms and language from the text. Each feature should capture essential elements without adding any interpretation, rephrasing, or extra context beyond what is explicitly stated.

The text may describe either an apparatus (a physical device or structure) or a methodology (a process with a sequence of steps). For apparatus claims, identify each structural element and its technical role or configuration exactly as stated in the text. For methodology claims, list each step in the exact sequence provided, preserving only the dependencies and terms given.

Each feature should be a single, concise sentence that strictly uses only the provided terms. Number features sequentially (F1, F2, etc.) and follow the exact output structure below.

Invention: {invention}

Output:
{{
    "F1": "Feature description using only exact terms...",
    "F2": "Next concise feature with only explicit details..."
}}
"""
    )

    # Initialize the Langchain chain with the desired model
    featureListExtractionChain = LLMChain(
        llm=ChatOpenAI(model=model_name),
        prompt=prompt_template
    )

    # Create a task for each invention
    tasks = []
    for key, value in data.items():
        tasks.append(process_invention(featureListExtractionChain, key, value))

    # Run all the tasks concurrently
    results = await asyncio.gather(*tasks)

    # Convert the (key, value) result pairs into a dictionary
    results_dict = {key: value for key, value in results}

    return results_dict


# Step 3: Save the results to a new JSON file synchronously
def save_results_to_json(results, output_file):
    with open(output_file, 'w') as outfile:
        json.dump(results, outfile, indent=4)


# Main function to tie everything together
def main(input_file_path, output_file_path):
    # Start timing
    start_time = time.time()

    # Step 1: Load the JSON file
    data = load_json_file(input_file_path)
    if data is None:
        print("Error: Data not loaded.")
        return

    # Step 2: Process the inventions asynchronously using asyncio.run()
    processed_results = asyncio.run(run_langchain_on_inventions(data))

    # Step 3: Save the processed results to a new JSON file
    save_results_to_json(processed_results, output_file_path)

    # Calculate and print the total execution time
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Script executed in: {execution_time:.2f} seconds")


# Run the script as a standalone program
if __name__ == "__main__":
    input_file = 'FTO_inventions.json'          # Path to your input JSON file
    output_file = 'FTO_GPT_FeatureList3.json'   # Path to your output JSON file
    main(input_file, output_file)
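
# ---------------------------------------------------------------------------
# Illustrative input/output shapes (an assumption, not taken from the actual
# data files). The script only requires that each top-level value in
# FTO_inventions.json contains an "invention" field, since it iterates over
# data.items() and reads value["invention"]. The keys and claim text below
# are made up purely for illustration:
#
# {
#     "invention_1": {"invention": "An apparatus comprising a housing, ..."},
#     "invention_2": {"invention": "A method of processing a signal, comprising ..."}
# }
#
# Each entry in the output file then pairs the original text with the parsed
# feature list requested by the prompt, e.g.:
# {"invention": "...", "result": {"F1": "...", "F2": "..."}}
# ---------------------------------------------------------------------------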