### Import the Libraries

In [2]:
#%reload_ext autoreload
#%autoreload 
from requests import Session
import sys
import pandas as pd
from tqdm import tqdm
from iorbit_client import IOrbitClient

import logging
logging.basicConfig(level=logging.INFO)
import json
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import os
import tiktoken
import re
from collections import defaultdict

In [3]:
# Load the .env file
load_dotenv()

# Access the API keys
openai_api_key = os.getenv('OPENAI_API_KEY')

### Loading the feature list and the patents for feature table creation

In [4]:
# Load the separate JSON files
with open("C:/Work/Qatent/Codes/TestCodes/FeatureTable/FeatureTable/IVS/Sample - Search Report 006 - IVS_006_features.json", "r") as f:
    features = json.load(f)

with open('C:/Work/Qatent/Codes/TestCodes/FeatureTable/FeatureTable/IVS6.json', "r") as f:
    concurIP_patents = json.load(f)

In [5]:
print(type(features))
print(type(concurIP_patents))

<class 'dict'>
<class 'dict'>


In [15]:
features

{'F1': 'A porous plug for use in a substrate support,',
 'F2': 'comprising: a porous central passageway; and',
 'F3': 'a solid outer shell bonded to and surrounding the porous central passageway such that there is no continuous gap between the porous central passageway and the solid outer shell along an entire length of the porous plug',
 'F4': 'wherein the solid outer shell includes sealing surfaces disposed on ends of the solid outer shell to facilitate forming a seal along the sealing surface and surrounding the porous central passageway.'}

In [None]:
#The concurIP_patents file is a dict of dicts

"""

{
    "2018WO-CN100373": {
        "Application_Number": "2018WO-CN100373",
        "Content": {
            "title": "Exhaust after-treatment device",
            "pa01": "The invention discloses .................",
            "c-en-0001": "An exhaust aftertreatment device comprising a ....",
            "c-en-0002": "The exhaust aftertreatment device of claim 1, wherein: the exhaust aftertreatment device is U-shaped.",
            "c-en-0003": "The e
            .
            .
            .
            "p0048": "It should be noted that the orientat.................."
        }
    },

    "2020WO-CN117105": {
        "Application_Number": "2020WO-CN117105",
        "Content": {
            "title": "Tail gas after-treatment shell and tail gas after-treatment sealing package",
            "pa01": "The invention discloses a................",
            "c-en-0001": "The utility model provides a ............................",
            "c-en-0002": "The exhaust aftertreatment housing of claim 1, wherein: .........",

            "p0107": "Referring to.............................",
            "p0108": "Continuing to .......................",
            "p0109": "Continuing to refer ............................."
        }
    }
}


"""

In [6]:
# Convert the feature list to a readable format for the model, assuming it’s a dictionary
features_text = "\n".join([f"{feature_id}: {desc}" for feature_id, desc in features.items()])
# Remove curved brackets with numbers in them
features_text = re.sub(r"\(\d+\)", "", features_text)
print(f'Feature List as a string:\n{features_text}')


# Create a new variable for just the feature descriptions for Orbit Search
features_only_text = "\n".join([f"{desc}" for desc in features.values()])
# Remove curved brackets with numbers in them
features_only_text = re.sub(r"\(\d+\)", "", features_only_text)
print(f'\nFeatures Text for Orbit search:\n{features_only_text}')

Feature List as a string:
F1: A porous plug for use in a substrate support,
F2: comprising: a porous central passageway; and
F3: a solid outer shell bonded to and surrounding the porous central passageway such that there is no continuous gap between the porous central passageway and the solid outer shell along an entire length of the porous plug
F4: wherein the solid outer shell includes sealing surfaces disposed on ends of the solid outer shell to facilitate forming a seal along the sealing surface and surrounding the porous central passageway.

Features Text for Orbit search:
A porous plug for use in a substrate support,
comprising: a porous central passageway; and
a solid outer shell bonded to and surrounding the porous central passageway such that there is no continuous gap between the porous central passageway and the solid outer shell along an entire length of the porous plug
wherein the solid outer shell includes sealing surfaces disposed on ends of the solid outer shell to faci

#### Logging into orbit (Optional)

In [8]:
import getpass
email = 'anshul.mahawar@concurip.com'
if not 'password' in globals():
    password = getpass.getpass("Enter your password: ")

c = IOrbitClient(email, password, host='betaqs.orbit.com', database='FAMPAT')
c.login()

{'ticket': 'b54acc8e-22b4-4b26-9e30-8003e186d5a5',
 'accountErpGuid': '0F99E464-EC39-476D-BBDA-8E2B651369DB',
 'status': 'ok',
 'message': '',
 'email': 'anshul.mahawar@concurip.com',
 'firstname': '',
 'lastname': '',
 'role': 'user',
 'shard': 'betaqs.orbit.com',
 'orbitServices': {'version': '2.0.0',
  'uuid': '4a567ef9-da0f-4e44-9dcb-d8932151596a154235944821712FR-DC1',
  'entity_id': '',
  'firstname': '',
  'lastname': '',
  'email': 'anshul.mahawar@concurip.com',
  'erp_guuid': '0F99E464-EC39-476D-BBDA-8E2B651369DB',
  'yoomap': [],
  'invention_url': '',
  'services': [{'name': 'X_WKF_RW'},
   {'name': 'X_GRAPHS'},
   {'name': 'X_EXPORTS'},
   {'name': 'X_SEARCH_RW'}]},
 'yoomap': []}

In [9]:
# Step 6: Use extracted features as search query and fetch 7 patents from Orbit API
api_response = c.semantic_search(features_only_text, ['TI', 'AB', 'ECLM', 'DESC'], count=5)

# Convert the response to a DataFrame for easier processing
df_orbit = pd.DataFrame(api_response['documents'])
df_orbit


INFO:root:semantic searching https://betaqs.orbit.com/rest/iorbit/user/semantic/FAMPAT;ticket=b54acc8e-22b4-4b26-9e30-8003e186d5a5 with {'range': '1-5', 'text': 'A porous plug for use in a substrate support,\ncomprising: a porous central passageway; and\na solid outer shell bonded to and surrounding the porous central passageway such that there is no continuous gap between the porous central passageway and the solid outer shell along an entire length of the porous plug\nwherein the solid outer shell includes sealing surfaces disposed on ends of the solid outer shell to facilitate forming a seal along the sealing surface and surrounding the porous central passageway.', 'fields': 'TI AB ECLM DESC'}


Unnamed: 0,AB,TI,RELEVANCE_SCORE,ID,REP,DESC,ECLM
0,(EP4441780)<br/>Porous plugs for gas delivery ...,Porous plug for electrostatic chuck gas delivery,100.0,EP4441780_A1,EP4441780_A1,"<p><h1>FIELD</h1></p><p><span class=""paragraph...",(WO2023/096756)<br/><p><heading>Claims: </head...
1,(EP0412229)<br/>A rotary valve assembly having...,Thermally stable sealing device for a butterfl...,93.0,EP-412229_A1,EP-412229_A1,<p><h1>BACKGROUND OF THE INVENTION</h1></p><p>...,(EP0412229)<br/><p>1. Temperature-resistant bi...
2,(DE102019112957)<br/>A sealing unit 1 comprise...,Annular seal for spark plug or the like,93.0,DE102019112957_A1,DE102019112957_A1,"<p><heading level=""1"">Field of technology</hea...",(GB2574032)<br/><p>1. A seal (1) comprising an...
3,(US20040200715)<br/>The present invention feat...,Dynamic flange seal and sealing system,93.0,US20040200715_A1,US20040200715_A1,"<p><h1 align=""LEFT"">RELATED APPLICATIONS </h1>...","(US20040200715)<br/><p><h1 align=""LEFT"">What i..."
4,(WO2018/079632)<br/>Provided are a seal struct...,"Seal structure, sealing method, and coupling e...",93.0,WO201879632_A1,WO201879632_A1,<p><h1>CROSS REFERENCE TO RELATED APPLICATIONS...,(US20190293178)<br/><p><b>1</b>. A seal struct...


In [10]:
df = df_orbit[['ID' ,'TI', 'AB', 'ECLM', 'DESC']]
df

Unnamed: 0,ID,TI,AB,ECLM,DESC
0,EP4441780_A1,Porous plug for electrostatic chuck gas delivery,(EP4441780)<br/>Porous plugs for gas delivery ...,(WO2023/096756)<br/><p><heading>Claims: </head...,"<p><h1>FIELD</h1></p><p><span class=""paragraph..."
1,EP-412229_A1,Thermally stable sealing device for a butterfl...,(EP0412229)<br/>A rotary valve assembly having...,(EP0412229)<br/><p>1. Temperature-resistant bi...,<p><h1>BACKGROUND OF THE INVENTION</h1></p><p>...
2,DE102019112957_A1,Annular seal for spark plug or the like,(DE102019112957)<br/>A sealing unit 1 comprise...,(GB2574032)<br/><p>1. A seal (1) comprising an...,"<p><heading level=""1"">Field of technology</hea..."
3,US20040200715_A1,Dynamic flange seal and sealing system,(US20040200715)<br/>The present invention feat...,"(US20040200715)<br/><p><h1 align=""LEFT"">What i...","<p><h1 align=""LEFT"">RELATED APPLICATIONS </h1>..."
4,WO201879632_A1,"Seal structure, sealing method, and coupling e...",(WO2018/079632)<br/>Provided are a seal struct...,(US20190293178)<br/><p><b>1</b>. A seal struct...,<p><h1>CROSS REFERENCE TO RELATED APPLICATIONS...


#### Clean the orbit data 

In [12]:
def clean_abstract(text):
    # Step 1: Remove initial text up to and including the first `>`
    cleaned_text = text.split('>', 1)[-1].strip()
    
    # Step 2: Remove any numbers within round brackets, e.g., "(10)"
    cleaned_text = re.sub(r"\(\d+\)", "", cleaned_text)
    
    # Final cleaning to remove extra whitespace
    return cleaned_text.strip()

def clean_claim_text(text):
    # Step 1: Remove the initial patent number and any tags that follow it, such as <br/> and <p>
    text = re.sub(r"^\(.*?\)<br/><p>", "", text)
    
    # Step 2: Remove all content within <h1> tags or <heading> tags (and the tags themselves)
    text = re.sub(r"<h1>.*?</h1>|<heading>.*?</heading>", "", text, flags=re.DOTALL)
    
    # Step 3: Insert a newline before any claim number following `</p><p>`
    text = re.sub(r"</p><p>(\d+\.)", r"\n\1", text)
    
    # Step 4: Remove all remaining HTML-like tags
    text = re.sub(r"<[^>]+>", "", text)
    
    # Step 5: Remove round brackets and their contents
    text = re.sub(r"\([^)]*\)", "", text)
    
    # Final cleanup to remove any extraneous whitespace
    return text.strip()


def clean_description_text(text):
    # Step 1: Remove specific headings like "CROSS REFERENCE TO RELATED APPLICATION" within <h1> tags
    # Adjusted regex to match specific <h1> sections and remove the entire section if it matches
    cleaned_text = re.sub(r"<p><h1>.*?RELATED APPLICATION.*?</h1></p>", "", text, flags=re.DOTALL | re.IGNORECASE)
    
    # Step 2: Remove other tags like <p>, </p>, <br/>, <ul>, <li>, </li>, and </ul>
    cleaned_text = re.sub(r"</?p>|<br/>|</?ul>|</?li>", "", cleaned_text, flags=re.DOTALL)

    # Step 3: Remove all instances of "FIG.", "fig.", followed by numbers
    cleaned_text = re.sub(r"\bFIG\.?\s*\d+|\bfig\.?\s*\d+", "", cleaned_text, flags=re.IGNORECASE).strip()

    # Step 4: Replace paragraph numbers with a newline and keep the paragraph number as text
    cleaned_text = re.sub(r'<span class="paragraph-number">\[(\d{4})\]\s*</span>', r"\n[\1]", cleaned_text)

    # Step 5: Remove standalone numbers, excluding those followed by '%' or letters
    cleaned_text = re.sub(r"\b\d+\b(?![%a-zA-Z])", "", cleaned_text)

    # Step 6: Remove empty square brackets created without a number
    cleaned_text = re.sub(r"\[\]", "", cleaned_text).strip()

    # Step 7: Remove empty parentheses "()"
    cleaned_text = re.sub(r"\(\s*\)", "", cleaned_text).strip()

    # Step 8: Remove any remaining HTML tags, including those with attributes
    cleaned_text = re.sub(r"<[^>]+>", "", cleaned_text).strip()

    # Step 9: Add a line number at the start of each new line
    lines = cleaned_text.splitlines()  # Split text by lines
    cleaned_text = "\n".join(f"{i + 1}. {line.strip()}" for i, line in enumerate(lines) if line.strip())

    return cleaned_text
    

# Apply the cleaning function to the 'ECLM' column in the DataFrame
df['AB'] = df['AB'].apply(clean_abstract)
df['ECLM'] = df['ECLM'].apply(clean_claim_text)
df['DESC'] = df['DESC'].apply(clean_description_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AB'] = df['AB'].apply(clean_abstract)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ECLM'] = df['ECLM'].apply(clean_claim_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DESC'] = df['DESC'].apply(clean_description_text)


In [13]:
df

Unnamed: 0,ID,TI,AB,ECLM,DESC
0,EP4441780_A1,Porous plug for electrostatic chuck gas delivery,Porous plugs for gas delivery in substrate sup...,1. A porous plug for use in a substrate suppor...,1. FIELD\n2. Embodiments of the present disclo...
1,EP-412229_A1,Thermally stable sealing device for a butterfl...,"A rotary valve assembly having a housing, a va...",1. Temperature-resistant bidirectional sealing...,1. BACKGROUND OF THE INVENTIONThis invention r...
2,DE102019112957_A1,Annular seal for spark plug or the like,A sealing unit 1 comprises annular first and s...,1. A seal comprising an annular first seal el...,1. Field of technology\n2. This disclosure rel...
3,US20040200715_A1,Dynamic flange seal and sealing system,The present invention features a flange to fla...,What is claimed and desired to be secured by L...,1. RELATED APPLICATIONS\n2. This application c...
4,WO201879632_A1,"Seal structure, sealing method, and coupling e...",Provided are a seal structure and sealing meth...,1. A seal structure provided at a butt portion...,1. This application is related to co-pending a...


#### Structure the orbit data into the json dict of dicts format 

In [24]:
orbit_json = {}
for _, row in df.iterrows():
    application_id = row['ID']  # Assuming 'ID' is the Application ID
    content = row.drop(['ID', 'ECLM', 'DESC']).to_dict()  # Drop 'ID', 'ECLM', and 'DESC' columns for base content
    
    # Process ECLM claims into c-en-000x keys
    claims = re.split(r"\s*(\d+)\.\s+", row['ECLM'])
    claim_dict = {}
    for i in range(1, len(claims) - 1, 2):
        claim_number = int(claims[i])
        claim_text = claims[i + 1]
        claim_key = f"c-en-{claim_number:04d}"  # Format claim number as c-en-000x
        claim_dict[claim_key] = claim_text.strip()
    
    # Process DESC paragraphs based on numbered paragraphs (e.g., "1.", "2.", etc.)
    paragraphs = re.split(r"\s*(\d+)\.\s+", row['DESC'])
    paragraph_dict = {}
    for i in range(1, len(paragraphs) - 1, 2):
        para_number = int(paragraphs[i])
        para_text = paragraphs[i + 1]
        para_key = f"p{para_number:04d}"  # Format paragraph number as p000x
        paragraph_dict[para_key] = para_text.strip()

    # Update content dictionary with claims and paragraphs
    content.update(claim_dict)
    content.update(paragraph_dict)
    
    # Add to main JSON structure with application ID as top-level key
    orbit_json[application_id] = {"Patent Num": application_id, "content": content}

# Display as JSON formatted string
json_output = json.dumps(orbit_json, indent=2)

# Optionally, save to a JSON file
output_file_path = "structured_orbit.json"
with open(output_file_path, "w") as file:
    json.dump(orbit_json, file, indent=2)

print(f"\nStructured JSON with claims and paragraphs saved to '{output_file_path}'")


Structured JSON with claims and paragraphs saved to 'structured_orbit.json'


In [16]:
print(type(orbit_json))
print(type(concurIP_patents))

<class 'dict'>
<class 'dict'>


In [17]:
print(len(orbit_json))
print(len(concurIP_patents))

5
4


In [18]:
# Convert the dictionary to a JSON string in a readable format
search_patent_str = json.dumps(concurIP_patents, ensure_ascii=False, indent=2)

search_patent_str

'{\n  "US10964579": {\n    "Patent Num": "US10964579",\n    "Content": {\n      "title": "Electrostatic chuck",\n      "pa01": "According to the embodiment, an electrostatic chuck includes a ceramic dielectric substrate, a base plate, and a porous part. The ceramic dielectric substrate has a first major surface placing a suction object, a second major surface on an opposite side to the first major surface, and a through hole provided from the second to first major surface. The base plate supports the ceramic dielectric substrate and includes a gas introduction path communicating with the through hole. The porous part is provided in the gas introduction path. The porous part includes sparse portions including pores and a dense portion having a higher density than the sparse portions. Each of the sparse portions extends in a first direction from the base plate toward the ceramic dielectric substrate. The dense portion is positioned between the sparse portions. The sparse portions include

#### Check token size, should be ok now that we are sending requests parallelly with async rather than all 7-8 patents at once

In [19]:
# Load the tokenizer for the GPT-4 model
tokenizer = tiktoken.encoding_for_model("gpt-4o")

# Calculate the number of tokens in the `search_patent_str`
tokens_patent = tokenizer.encode(search_patent_str)
num_tokens_patent = len(tokens_patent)

tokens_featuelist = tokenizer.encode(features_text)
num_tokens_list = len(tokens_featuelist)

# Print the token count
print("Token count for `search_patent_str`:", num_tokens_patent)
print("Token count for `feature_text`:", num_tokens_list)

Token count for `search_patent_str`: 32943
Token count for `feature_text`: 105


In [22]:
print(concurIP_patents.keys())
len(concurIP_patents)

dict_keys(['US10964579', 'US20190371578', 'US20200373184', 'US9767993'])


4

In [21]:
print(orbit_json.keys())
len(orbit_json)

dict_keys(['EP4441780_A1', 'EP-412229_A1', 'DE102019112957_A1', 'US20040200715_A1', 'WO201879632_A1'])


5

In [None]:
# Combine both ConcurIP and Orbit semantic search results
combined_patents = orbit_json.copy()
combined_patents.update(concurIP_patents)

# Save the combined_patents data to a new file
output_file_path = "combined_patents.json"
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(combined_patents, outfile, ensure_ascii=False, indent=4)

print(f"combined_patents JSON saved to '{output_file_path}'.")

combined_patents JSON saved to 'combined_patents.json'.


In [32]:
print(combined_patents.keys())
len(combined_patents)

dict_keys(['EP4441780_A1', 'EP-412229_A1', 'DE102019112957_A1', 'US20040200715_A1', 'WO201879632_A1', 'US10964579', 'US20190371578', 'US20200373184', 'US9767993'])


9

In [33]:
# Define the prompt template with placeholders for features and search patent data
prompt_template = PromptTemplate(
    template="""
    You are a patent attorney with expertise in analyzing patent documents. You have been provided with a list of extracted features and one search patent. Your task is to analyze the text and classify each feature based on its alignment with the patent, following these steps:

    ---

    ### Classification Guidelines:

    1. **Search the patent text thoroughly** for instances of the exact feature description. If an exact match is not found:
       - Look for related phrases, synonyms, or functional descriptions that convey the **same intent, purpose, or outcome**, even if phrasing or implementation differs.
       - Focus on whether the patent achieves the **same goal or functionality** described in the feature, even if achieved through alternate methods.

    2. Classify each feature as:
       - **Present**: The feature or its equivalent is clearly described in the patent. This includes cases where:
           - The feature’s intent and functionality are fully captured, even if phrasing, terminology, or supporting details differ.
           - The patent text describes elements or mechanisms that **imply or achieve the feature’s intent**, even without explicitly matching the phrasing.
       - **Partially Present**: Reserve this classification for cases where:
           - Key aspects of the feature’s intent or functionality are missing or described inconsistently.
           - The feature’s purpose is only partially met due to significant differences in structure, methodology, or implementation.
       - **Not Present**: Use this classification only when the patent text has **no meaningful alignment** with the feature’s intent, purpose, or functionality, even indirectly.

    3. When determining **Present** vs. **Partially Present**:
       - Lean toward **Present** if the patent text strongly aligns with the feature’s purpose or functionality, even if phrasing or minor details differ.
       - Reserve **Partially Present** for cases where critical components or aspects of functionality are missing, and the alignment is incomplete.
       - Use **Not Present** sparingly, only when there is no relevant overlap or alignment.

    4. **Avoid Defaulting to "No"**:
       - Do not classify features as **"Not Present"** simply because phrasing or terminology differs.
       - Prioritize intent and functionality over exact phrasing.

    5. Provide a detailed explanation for each classification:
       - For **Present**: Highlight how the description aligns with the feature’s intent, explaining overlaps and why any minor differences do not affect the classification.
       - For **Partially Present**: Clearly explain the overlaps and differences, focusing on why the alignment is incomplete.
       - For **Not Present**: Justify why the feature cannot be found, citing any tangentially related text if applicable.

    ---

    ### General Example of Analysis:

    - **Feature**: "A system includes a component that performs a specific function to achieve a particular outcome."
      - **Present**: "The patent describes a component that achieves the specified function, fully aligning with the feature's intent and purpose."
      - **Partially Present**: "The patent describes related functionality, but critical details or key elements of the feature are missing."
      - **Not Present**: "The patent does not describe any component or functionality resembling the feature or achieving a similar outcome."

    ---

    Features:
    {features}

    Patent:
    {patent}

    Strictly Follow the Output JSON Format:
    {{
        "Patent_num": "Patent Num",
        "feature_table": [
            {{
                "feature_id": "Feature ID",
                "feature": "Feature description",
                "present": "Yes/No/Partially",
                "comment": "Detailed explanation of why the feature is classified as present, partially present, or not present. For partial matches, describe overlaps, differences, and related concepts.",
                "most_relevant_paragraph_ids": ["p000x", "c-en-000x"]
            }} 
        ]
    }}
    """,
    input_variables=["features", "patent"]
)

# Initialize the OpenAI LLM for GPT-4o
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Set up the LangChain with LLM and prompt
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)


#### Run the prompt chain on each patent using async

In [None]:
import json
import asyncio
import nest_asyncio

# Allow nested async loops
nest_asyncio.apply()

# Define an async function for processing each patent
async def process_patent(patent_id, patent_data, features_text, llm_chain):
    search_patent_str = json.dumps(patent_data, ensure_ascii=False, indent=2)
    response = await llm_chain.arun(features=features_text, patent=search_patent_str)
    cleaned_result = response.replace("```json", "").replace("```", "").strip()
    try:
        parsed_response = json.loads(cleaned_result)
        return {"patent_id": patent_id, "response": parsed_response}
    except json.JSONDecodeError:
        print(f"Failed to parse response for patent '{patent_id}'.")
        return None

# Main async function to gather all tasks
async def main(concurIP_patents, features_text, llm_chain):                     # Replace concurIP_patents with orbit_json or combined_patents
    tasks = [process_patent(patent_id, patent_data, features_text, llm_chain)
             for patent_id, patent_data in concurIP_patents.items()]            # Replace concurIP_patents with orbit_json or combined_patents
                        
    results = await asyncio.gather(*tasks)
    formatted_responses = {result["patent_id"]: result["response"] for result in results if result is not None}
    return formatted_responses

# Run the async function
responses = asyncio.run(main(concurIP_patents, features_text, llm_chain))        # Replace concurIP_patents with orbit_json or combined_patents

# Save the responses to a JSON file
output_file_path = "Results.json"
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(responses, f, ensure_ascii=False, indent=2)

print(f"Formatted responses saved to '{output_file_path}' for viewing.")


INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=8292 request_id=req_5c91641c45f37bb80d647dc749ba740c response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=8369 request_id=req_b83419f051f8907fb361b7af98c5b9fe response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=9097 request_id=req_d2152f2be6f128d8f9b9908004794254 response_code=200
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=9764 request_id=req_627e471d23af7bcd60984aa34b3e53a3 response_code=200


Formatted responses saved to 'Results.json' for viewing.


In [28]:
# Create a dictionary to store features and their "present" status per patent
feature_dict = {}

# Iterate through the JSON data
for patent_id, patent_data in responses.items():
    for feature in patent_data["feature_table"]:
        feature_id = feature["feature_id"]
        present_status = feature["present"]
        
        # Add to the dictionary
        if feature_id not in feature_dict:
            feature_dict[feature_id] = {}
        feature_dict[feature_id][patent_id] = present_status

# Convert the dictionary to a DataFrame
df = pd.DataFrame.from_dict(feature_dict, orient="index")

# Display the DataFrame
print("Feature DataFrame:")
print(df)

Feature DataFrame:
     US10964579 US20190371578 US20200373184    US9767993
F1          Yes           Yes           Yes    Partially
F2          Yes     Partially     Partially    Partially
F3    Partially            No            No  Not Present
F4  Not Present            No     Partially  Not Present
