# FeatureListAsync.py — asynchronously extract numbered feature lists from invention texts via langchain.
import os
import json
import time
import asyncio
import warnings
from dotenv import load_dotenv
from langchain_community.chat_models import ChatOpenAI
from langchain import LLMChain, PromptTemplate
import nest_asyncio

# Apply the nest_asyncio patch to allow nested event loops, so that
# asyncio.run() works even when called from an environment that already
# runs a loop (e.g. a notebook).
nest_asyncio.apply()

# Suppress all warnings.
# NOTE(review): this blanket filter also hides deprecation warnings from
# the langchain imports above — consider narrowing it.
warnings.filterwarnings("ignore")

# Load environment variables from a .env file.
load_dotenv()
# Read the OpenAI key; in the visible code this variable is never used
# directly — presumably langchain picks the key up from the environment.
openai_api_key = os.getenv('OPENAI_API_KEY')
  17. # Step 1: Load the JSON file synchronously
  18. def load_json_file(file_path):
  19. with open(file_path, 'r') as json_file:
  20. data = json.load(json_file)
  21. print('Loaded data')
  22. return data
  23. # Define an asynchronous function to run Langchain on an individual invention
  24. async def process_invention(featureListExtractionChain, key, value):
  25. print(f'Generating Feature List for {key}')
  26. invention_text = value["invention"]
  27. # Process the invention text with the Langchain chain asynchronously
  28. result = await featureListExtractionChain.arun({"invention": invention_text})
  29. # Clean the result to remove code block markers and extra formatting
  30. cleaned_result = result.replace("```json", "").replace("```", "").strip()
  31. # Try to parse the cleaned result as JSON
  32. try:
  33. parsed_result = json.loads(cleaned_result)
  34. except json.JSONDecodeError as e:
  35. print(f"Error parsing result for {key}: {e}")
  36. parsed_result = cleaned_result # Fallback to raw string if parsing fails
  37. # Return the key and result for later collection
  38. return key, {"invention": invention_text, "result": parsed_result}
# Step 2: Run Langchain prompt on each invention asynchronously
async def run_langchain_on_inventions(data, model_name='gpt-4o-mini'):
    """Run the feature-list extraction prompt concurrently over all inventions.

    Args:
        data: Mapping of invention key -> {"invention": <text>, ...}.
        model_name: OpenAI chat model name (default 'gpt-4o-mini').

    Returns:
        Dict mapping each key to the {"invention": ..., "result": ...}
        payload produced by process_invention().
    """
    # Prompt template: instructs the model to emit only a JSON object of
    # sequentially numbered features (F1, F2, ...) using verbatim terms.
    prompt_template = PromptTemplate(
        input_variables=["invention"],
        template="""
Break down the provided text, which may be an invention claim or description, into concise technical features using only the exact terms and language from the text. Each feature should capture essential elements without adding any interpretation, rephrasing, or extra context beyond what is explicitly stated.
The text may describe either an apparatus, a physical device or structure, or a methodology, which outlines a process with a sequence of steps.
For apparatus claims, identify each structural element and its technical role or configuration exactly as stated in the text. For methodology claims, list each step in the exact sequence provided, preserving only the dependencies and terms given.
Each feature should be a single, concise sentence that strictly uses only the provided terms. Number features sequentially (F1, F2, etc.) and follow the exact output structure below.
Invention: {invention}
Output:
{{
"F1": "Feature description using only exact terms...",
"F2": "Next concise feature with only explicit details..."
}}
"""
    )

    # Initialize the Langchain LLM chain with the desired model.
    featureListExtractionChain = LLMChain(
        llm=ChatOpenAI(model=model_name),
        prompt=prompt_template
    )

    # Create a list to hold the asynchronous tasks.
    # Nothing executes yet: these are bare coroutines until gather() runs.
    tasks = []
    for key, value in data.items():
        tasks.append(process_invention(featureListExtractionChain, key, value))

    # Run all the tasks concurrently.
    results = await asyncio.gather(*tasks)

    # Convert the (key, payload) pairs back into a dictionary.
    results_dict = {key: value for key, value in results}
    return results_dict
  72. # Step 3: Save the results to a new JSON file synchronously
  73. def save_results_to_json(results, output_file):
  74. with open(output_file, 'w') as outfile:
  75. json.dump(results, outfile, indent=4)
  76. # Main function to tie everything together
  77. def main(input_file_path, output_file_path):
  78. # Start timing
  79. start_time = time.time()
  80. # Step 1: Load the JSON file
  81. data = load_json_file(input_file_path)
  82. if data is None:
  83. print("Error: Data not loaded.")
  84. return
  85. # Step 2: Process the inventions asynchronously using asyncio.run()
  86. processed_results = asyncio.run(run_langchain_on_inventions(data))
  87. # Step 3: Save the processed results to a new JSON file
  88. save_results_to_json(processed_results, output_file_path)
  89. # End timing
  90. end_time = time.time()
  91. # Calculate and print the total execution time
  92. execution_time = end_time - start_time
  93. print(f"Script executed in: {execution_time:.2f} seconds")
# Run the script as a standalone program
if __name__ == "__main__":
    input_file = 'FTO_inventions.json'  # Set the path to your input JSON file
    output_file = 'FTO_GPT_FeatureList3.json'  # Set the path to your output JSON file
    main(input_file, output_file)