import asyncio
import json
import os
import time
import warnings

from dotenv import load_dotenv
from langchain import LLMChain, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
import nest_asyncio

# Patch the current event loop so asyncio.run() works even inside
# environments that already run a loop (e.g. Jupyter notebooks).
nest_asyncio.apply()

# Suppress all warnings (LangChain emits noisy deprecation warnings).
warnings.filterwarnings("ignore")

# Load credentials from a local .env file; the key is read here and is
# picked up by ChatOpenAI from the environment.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
# Step 1: Load the JSON file synchronously
def load_json_file(file_path):
    """Read *file_path* as JSON and return the parsed data.

    Parameters:
        file_path: path to a JSON file on disk.

    Returns:
        The deserialized Python object (for this pipeline, a dict of
        inventions keyed by identifier).

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Explicit encoding avoids platform-dependent default encodings.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    print('Loaded data')
    return data
# Run the LangChain chain on one individual invention, asynchronously
async def process_invention(featureListExtractionChain, key, value):
    """Extract the feature list for a single invention.

    Parameters:
        featureListExtractionChain: object exposing an awaitable ``arun``
            that accepts ``{"invention": <text>}`` and returns a string.
        key: identifier of the invention in the input mapping.
        value: dict holding the invention text under the "invention" key.

    Returns:
        A ``(key, payload)`` tuple where payload is
        ``{"invention": <text>, "result": <parsed JSON dict, or the raw
        cleaned string when the model's output is not valid JSON>}``.
    """
    print(f'Generating Feature List for {key}')
    invention_text = value["invention"]
    # Await the chain so many inventions can be in flight concurrently.
    result = await featureListExtractionChain.arun({"invention": invention_text})
    # Models often wrap their JSON answer in a ```json fenced block; strip it.
    cleaned_result = result.replace("```json", "").replace("```", "").strip()
    try:
        parsed_result = json.loads(cleaned_result)
    except json.JSONDecodeError as e:
        print(f"Error parsing result for {key}: {e}")
        parsed_result = cleaned_result  # Fallback to raw string if parsing fails
    # Return the key alongside the payload so gather() results can be
    # reassembled into a dict later.
    return key, {"invention": invention_text, "result": parsed_result}
# Step 2: Run the LangChain prompt on each invention asynchronously
async def run_langchain_on_inventions(data, model_name='gpt-4o-mini'):
    """Fan out the feature-extraction prompt over every invention in *data*.

    Parameters:
        data: mapping of invention id -> {"invention": <text>, ...}.
        model_name: OpenAI chat model to use (default 'gpt-4o-mini').

    Returns:
        dict mapping each invention id to
        ``{"invention": <text>, "result": <parsed feature list>}``.
    """
    # Prompt template: instructs the model to emit a numbered JSON feature list.
    prompt_template = PromptTemplate(
        input_variables=["invention"],
        template="""
Break down the provided text, which may be an invention claim or description, into concise technical features using only the exact terms and language from the text. Each feature should capture essential elements without adding any interpretation, rephrasing, or extra context beyond what is explicitly stated.
The text may describe either an apparatus, a physical device or structure, or a methodology, which outlines a process with a sequence of steps.
For apparatus claims, identify each structural element and its technical role or configuration exactly as stated in the text. For methodology claims, list each step in the exact sequence provided, preserving only the dependencies and terms given.
Each feature should be a single, concise sentence that strictly uses only the provided terms. Number features sequentially (F1, F2, etc.) and follow the exact output structure below.
Invention: {invention}
Output:
{{
"F1": "Feature description using only exact terms...",
"F2": "Next concise feature with only explicit details..."
}}
"""
    )

    # Bind the prompt to the requested chat model.
    featureListExtractionChain = LLMChain(
        llm=ChatOpenAI(model=model_name),
        prompt=prompt_template
    )

    # One task per invention; all of them run concurrently.
    tasks = [
        process_invention(featureListExtractionChain, key, value)
        for key, value in data.items()
    ]
    results = await asyncio.gather(*tasks)

    # gather() preserves task order; each task returned a (key, payload) pair.
    return dict(results)
# Step 3: Save the results to a new JSON file synchronously
def save_results_to_json(results, output_file):
    """Serialize *results* to *output_file* as pretty-printed JSON.

    Parameters:
        results: JSON-serializable object (the processed-inventions dict).
        output_file: destination path; overwritten if it already exists.
    """
    # Explicit encoding avoids platform-dependent default encodings.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, indent=4)
# Main function to tie everything together
def main(input_file_path, output_file_path):
    """Load inventions, extract feature lists concurrently, save results.

    Parameters:
        input_file_path: path to the input JSON file of inventions.
        output_file_path: path where the processed results are written.
    """
    # perf_counter is monotonic, so the elapsed time is immune to
    # wall-clock adjustments during the run.
    start_time = time.perf_counter()

    # Step 1: Load the JSON file.
    data = load_json_file(input_file_path)
    if data is None:
        # Defensive guard: load_json_file raises rather than returning None,
        # but keep the check in case the loader is swapped for a lenient one.
        print("Error: Data not loaded.")
        return

    # Step 2: Process the inventions asynchronously.
    processed_results = asyncio.run(run_langchain_on_inventions(data))

    # Step 3: Persist the processed results.
    save_results_to_json(processed_results, output_file_path)

    execution_time = time.perf_counter() - start_time
    print(f"Script executed in: {execution_time:.2f} seconds")
# Run the script as a standalone program
if __name__ == "__main__":
    input_file = 'FTO_inventions.json'  # Set the path to your input JSON file
    output_file = 'FTO_GPT_FeatureList3.json'  # Set the path to your output JSON file
    main(input_file, output_file)
|