import asyncio
import json
import os
import time
import warnings

from dotenv import load_dotenv
from langchain import LLMChain, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
import nest_asyncio

# Patch the current event loop so asyncio.run() works even inside
# environments that already run a loop (e.g. Jupyter notebooks).
nest_asyncio.apply()

# Suppress all warnings (LangChain emits noisy deprecation warnings).
warnings.filterwarnings("ignore")

# Load credentials from a local .env file; the key is read here and is
# picked up by ChatOpenAI from the environment.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
# Step 1: Load the JSON file synchronously
def load_json_file(file_path):
    """Read *file_path* as JSON and return the parsed data.

    Parameters:
        file_path: path to a JSON file on disk.

    Returns:
        The deserialized Python object (for this pipeline, a dict of
        inventions keyed by identifier).

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Explicit encoding avoids platform-dependent default encodings.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    print('Loaded data')
    return data
# Run the LangChain chain on one individual invention, asynchronously
async def process_invention(featureListExtractionChain, key, value):
    """Extract the feature list for a single invention.

    Parameters:
        featureListExtractionChain: object exposing an awaitable ``arun``
            that accepts ``{"invention": <text>}`` and returns a string.
        key: identifier of the invention in the input mapping.
        value: dict holding the invention text under the "invention" key.

    Returns:
        A ``(key, payload)`` tuple where payload is
        ``{"invention": <text>, "result": <parsed JSON dict, or the raw
        cleaned string when the model's output is not valid JSON>}``.
    """
    print(f'Generating Feature List for {key}')
    invention_text = value["invention"]
    # Await the chain so many inventions can be in flight concurrently.
    result = await featureListExtractionChain.arun({"invention": invention_text})
    # Models often wrap their JSON answer in a ```json fenced block; strip it.
    cleaned_result = result.replace("```json", "").replace("```", "").strip()
    try:
        parsed_result = json.loads(cleaned_result)
    except json.JSONDecodeError as e:
        print(f"Error parsing result for {key}: {e}")
        parsed_result = cleaned_result  # Fallback to raw string if parsing fails
    # Return the key alongside the payload so gather() results can be
    # reassembled into a dict later.
    return key, {"invention": invention_text, "result": parsed_result}
# Step 2: Run the LangChain prompt on each invention asynchronously
async def run_langchain_on_inventions(data, model_name='gpt-4o-mini'):
    """Fan out the feature-extraction prompt over every invention in *data*.

    Parameters:
        data: mapping of invention id -> {"invention": <text>, ...}.
        model_name: OpenAI chat model to use (default 'gpt-4o-mini').

    Returns:
        dict mapping each invention id to
        ``{"invention": <text>, "result": <parsed feature list>}``.
    """
    # Prompt template: instructs the model to emit a numbered JSON feature list.
    prompt_template = PromptTemplate(
        input_variables=["invention"],
        template="""
Break down the provided text, which may be an invention claim or description, into concise technical features using only the exact terms and language from the text. Each feature should capture essential elements without adding any interpretation, rephrasing, or extra context beyond what is explicitly stated.
The text may describe either an apparatus, a physical device or structure, or a methodology, which outlines a process with a sequence of steps.
For apparatus claims, identify each structural element and its technical role or configuration exactly as stated in the text. For methodology claims, list each step in the exact sequence provided, preserving only the dependencies and terms given.
Each feature should be a single, concise sentence that strictly uses only the provided terms. Number features sequentially (F1, F2, etc.) and follow the exact output structure below.
Invention: {invention}
Output:
{{
"F1": "Feature description using only exact terms...",
"F2": "Next concise feature with only explicit details..."
}}
"""
    )

    # Bind the prompt to the requested chat model.
    featureListExtractionChain = LLMChain(
        llm=ChatOpenAI(model=model_name),
        prompt=prompt_template
    )

    # One task per invention; all of them run concurrently.
    tasks = [
        process_invention(featureListExtractionChain, key, value)
        for key, value in data.items()
    ]
    results = await asyncio.gather(*tasks)

    # gather() preserves task order; each task returned a (key, payload) pair.
    return dict(results)
# Step 3: Save the results to a new JSON file synchronously
def save_results_to_json(results, output_file):
    """Serialize *results* to *output_file* as pretty-printed JSON.

    Parameters:
        results: JSON-serializable object (the processed-inventions dict).
        output_file: destination path; overwritten if it already exists.
    """
    # Explicit encoding avoids platform-dependent default encodings.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, indent=4)
# Main function to tie everything together
def main(input_file_path, output_file_path):
    """Load inventions, extract feature lists concurrently, save results.

    Parameters:
        input_file_path: path to the input JSON file of inventions.
        output_file_path: path where the processed results are written.
    """
    # perf_counter is monotonic, so the elapsed time is immune to
    # wall-clock adjustments during the run.
    start_time = time.perf_counter()

    # Step 1: Load the JSON file.
    data = load_json_file(input_file_path)
    if data is None:
        # Defensive guard: load_json_file raises rather than returning None,
        # but keep the check in case the loader is swapped for a lenient one.
        print("Error: Data not loaded.")
        return

    # Step 2: Process the inventions asynchronously.
    processed_results = asyncio.run(run_langchain_on_inventions(data))

    # Step 3: Persist the processed results.
    save_results_to_json(processed_results, output_file_path)

    execution_time = time.perf_counter() - start_time
    print(f"Script executed in: {execution_time:.2f} seconds")
# Run the script as a standalone program
if __name__ == "__main__":
    input_file = 'FTO_inventions.json'  # Set the path to your input JSON file
    output_file = 'FTO_GPT_FeatureList3.json'  # Set the path to your output JSON file
    main(input_file, output_file)
|