{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import the Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#%reload_ext autoreload\n",
"#%autoreload \n",
"from requests import Session\n",
"import sys\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from iorbit_client import IOrbitClient\n",
"\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)\n",
"import json\n",
"from langchain_community.chat_models import ChatOpenAI\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.chains import LLMChain\n",
"from dotenv import load_dotenv\n",
"import os\n",
"import tiktoken\n",
"import re\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load the .env file\n",
"load_dotenv()\n",
"\n",
"# Access the API keys\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading the feature list and the patents for feature table creation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the separate JSON files\n",
"with open(\"C:/Work/Qatent/Codes/TestCodes/FeatureTable/FeatureTable/IVS/Sample - Search Report 006 - IVS_006_features.json\", \"r\") as f:\n",
" features = json.load(f)\n",
"\n",
"with open('C:/Work/Qatent/Codes/TestCodes/FeatureTable/FeatureTable/IVS6.json', \"r\") as f:\n",
" concurIP_patents = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" \n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" AB \n",
" TI \n",
" RELEVANCE_SCORE \n",
" ID \n",
" REP \n",
" DESC \n",
" ECLM \n",
" \n",
" \n",
" 0 \n",
" (EP4441780)<br/>Porous plugs for gas delivery ... \n",
" Porous plug for electrostatic chuck gas delivery \n",
" 100.0 \n",
" EP4441780_A1 \n",
" EP4441780_A1 \n",
" <p><h1>FIELD</h1></p><p><span class=\"paragraph... \n",
" (WO2023/096756)<br/><p><heading>Claims: </head... \n",
" \n",
" \n",
" 1 \n",
" (EP0412229)<br/>A rotary valve assembly having... \n",
" Thermally stable sealing device for a butterfl... \n",
" 93.0 \n",
" EP-412229_A1 \n",
" EP-412229_A1 \n",
" <p><h1>BACKGROUND OF THE INVENTION</h1></p><p>... \n",
" (EP0412229)<br/><p>1. Temperature-resistant bi... \n",
" \n",
" \n",
" 2 \n",
" (DE102019112957)<br/>A sealing unit 1 comprise... \n",
" Annular seal for spark plug or the like \n",
" 93.0 \n",
" DE102019112957_A1 \n",
" DE102019112957_A1 \n",
" <p><heading level=\"1\">Field of technology</hea... \n",
" (GB2574032)<br/><p>1. A seal (1) comprising an... \n",
" \n",
" \n",
" 3 \n",
" (US20040200715)<br/>The present invention feat... \n",
" Dynamic flange seal and sealing system \n",
" 93.0 \n",
" US20040200715_A1 \n",
" US20040200715_A1 \n",
" <p><h1 align=\"LEFT\">RELATED APPLICATIONS </h1>... \n",
" (US20040200715)<br/><p><h1 align=\"LEFT\">What i... \n",
" \n",
" \n",
" \n",
"4 \n",
" (WO2018/079632)<br/>Provided are a seal struct... \n",
" Seal structure, sealing method, and coupling e... \n",
" 93.0 \n",
" WO201879632_A1 \n",
" WO201879632_A1 \n",
" <p><h1>CROSS REFERENCE TO RELATED APPLICATIONS... \n",
" (US20190293178)<br/><p><b>1</b>. A seal struct... \n",
"
Porous plugs for gas delivery ... \n",
"1 (EP0412229)
A rotary valve assembly having... \n",
"2 (DE102019112957)
A sealing unit 1 comprise... \n",
"3 (US20040200715)
The present invention feat... \n",
"4 (WO2018/079632)
Provided are a seal struct... \n",
"\n",
" TI RELEVANCE_SCORE \\\n",
"0 Porous plug for electrostatic chuck gas delivery 100.0 \n",
"1 Thermally stable sealing device for a butterfl... 93.0 \n",
"2 Annular seal for spark plug or the like 93.0 \n",
"3 Dynamic flange seal and sealing system 93.0 \n",
"4 Seal structure, sealing method, and coupling e... 93.0 \n",
"\n",
" ID REP \\\n",
"0 EP4441780_A1 EP4441780_A1 \n",
"1 EP-412229_A1 EP-412229_A1 \n",
"2 DE102019112957_A1 DE102019112957_A1 \n",
"3 US20040200715_A1 US20040200715_A1 \n",
"4 WO201879632_A1 WO201879632_A1 \n",
"\n",
" DESC \\\n",
"0 FIELD
BACKGROUND OF THE INVENTION
... \n", "2
1. Temperature-resistant bi... \n",
"2 (GB2574032) 1. A seal (1) comprising an... \n",
"3 (US20040200715) 1. A seal struct... "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Step 6: Use extracted features as search query and fetch 7 patents from Orbit API\n",
"api_response = c.semantic_search(features_only_text, ['TI', 'AB', 'ECLM', 'DESC'], count=5)\n",
"\n",
"# Convert the response to a DataFrame for easier processing\n",
"df_orbit = pd.DataFrame(api_response['documents'])\n",
"df_orbit\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" 1. Temperature-resistant bi... \n",
"2 (GB2574032) 1. A seal (1) comprising an... \n",
"3 (US20040200715) 1. A seal struct... \n",
"\n",
" DESC \n",
"0 ... \n",
"2 \n",
" text = re.sub(r\"^\\(.*?\\) \", \"\", text)\n",
" \n",
" # Step 2: Remove all content within `\n",
" text = re.sub(r\" (\\d+\\.)\", r\"\\n\\1\", text)\n",
" \n",
" # Step 4: Remove all remaining HTML-like tags\n",
" text = re.sub(r\"<[^>]+>\", \"\", text)\n",
" \n",
" # Step 5: Remove round brackets and their contents\n",
" text = re.sub(r\"\\([^)]*\\)\", \"\", text)\n",
" \n",
" # Final cleanup to remove any extraneous whitespace\n",
" return text.strip()\n",
"\n",
"\n",
"def clean_description_text(text):\n",
" # Step 1: Remove specific headings like \"CROSS REFERENCE TO RELATED APPLICATION\" within , RELATED APPLICATIONS
... \n",
"4 CROSS REFERENCE TO RELATED APPLICATIONS... \n",
"\n",
" ECLM \n",
"0 (WO2023/096756)
What i... \n",
"4 (US20190293178)
\n",
" \n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" ID \n",
" TI \n",
" AB \n",
" ECLM \n",
" DESC \n",
" \n",
" \n",
" 0 \n",
" EP4441780_A1 \n",
" Porous plug for electrostatic chuck gas delivery \n",
" (EP4441780)<br/>Porous plugs for gas delivery ... \n",
" (WO2023/096756)<br/><p><heading>Claims: </head... \n",
" <p><h1>FIELD</h1></p><p><span class=\"paragraph... \n",
" \n",
" \n",
" 1 \n",
" EP-412229_A1 \n",
" Thermally stable sealing device for a butterfl... \n",
" (EP0412229)<br/>A rotary valve assembly having... \n",
" (EP0412229)<br/><p>1. Temperature-resistant bi... \n",
" <p><h1>BACKGROUND OF THE INVENTION</h1></p><p>... \n",
" \n",
" \n",
" 2 \n",
" DE102019112957_A1 \n",
" Annular seal for spark plug or the like \n",
" (DE102019112957)<br/>A sealing unit 1 comprise... \n",
" (GB2574032)<br/><p>1. A seal (1) comprising an... \n",
" <p><heading level=\"1\">Field of technology</hea... \n",
" \n",
" \n",
" 3 \n",
" US20040200715_A1 \n",
" Dynamic flange seal and sealing system \n",
" (US20040200715)<br/>The present invention feat... \n",
" (US20040200715)<br/><p><h1 align=\"LEFT\">What i... \n",
" <p><h1 align=\"LEFT\">RELATED APPLICATIONS </h1>... \n",
" \n",
" \n",
" \n",
"4 \n",
" WO201879632_A1 \n",
" Seal structure, sealing method, and coupling e... \n",
" (WO2018/079632)<br/>Provided are a seal struct... \n",
" (US20190293178)<br/><p><b>1</b>. A seal struct... \n",
" <p><h1>CROSS REFERENCE TO RELATED APPLICATIONS... \n",
"
Porous plugs for gas delivery ... \n",
"1 (EP0412229)
A rotary valve assembly having... \n",
"2 (DE102019112957)
A sealing unit 1 comprise... \n",
"3 (US20040200715)
The present invention feat... \n",
"4 (WO2018/079632)
Provided are a seal struct... \n",
"\n",
" ECLM \\\n",
"0 (WO2023/096756)What i... \n",
"4 (US20190293178)
FIELD
BACKGROUND OF THE INVENTION
RELATED APPLICATIONS
... \n",
"4 CROSS REFERENCE TO RELATED APPLICATIONS... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df_orbit[['ID' ,'TI', 'AB', 'ECLM', 'DESC']]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Clean the orbit data "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ayush.tiwari\\AppData\\Local\\Temp\\ipykernel_12872\\2467406529.py:65: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df['AB'] = df['AB'].apply(clean_abstract)\n",
"C:\\Users\\ayush.tiwari\\AppData\\Local\\Temp\\ipykernel_12872\\2467406529.py:66: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df['ECLM'] = df['ECLM'].apply(clean_claim_text)\n",
"C:\\Users\\ayush.tiwari\\AppData\\Local\\Temp\\ipykernel_12872\\2467406529.py:67: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df['DESC'] = df['DESC'].apply(clean_description_text)\n"
]
}
],
"source": [
"def clean_abstract(text):\n",
" # Step 1: Remove initial text up to and including the first `>`\n",
" cleaned_text = text.split('>', 1)[-1].strip()\n",
" \n",
" # Step 2: Remove any numbers within round brackets, e.g., \"(10)\"\n",
" cleaned_text = re.sub(r\"\\(\\d+\\)\", \"\", cleaned_text)\n",
" \n",
" # Final cleaning to remove extra whitespace\n",
" return cleaned_text.strip()\n",
"\n",
"def clean_claim_text(text):\n",
" # Step 1: Remove the initial patent number and any tags that follow it, such as
and tags or
.*?
| tags\n",
" # Adjusted regex to match specific
sections and remove the entire section if it matches\n",
" cleaned_text = re.sub(r\"
.*?RELATED APPLICATION.*?
, ,
\n",
" cleaned_text = re.sub(r\"?p>|
|?ul>|?li>\", \"\", cleaned_text, flags=re.DOTALL)\n",
"\n",
" # Step 3: Remove all instances of \"FIG.\", \"fig.\", followed by numbers\n",
" cleaned_text = re.sub(r\"\\bFIG\\.?\\s*\\d+|\\bfig\\.?\\s*\\d+\", \"\", cleaned_text, flags=re.IGNORECASE).strip()\n",
"\n",
" # Step 4: Replace paragraph numbers with a newline and keep the paragraph number as text\n",
" cleaned_text = re.sub(r'\\[(\\d{4})\\]\\s*', r\"\\n[\\1]\", cleaned_text)\n",
"\n",
" # Step 5: Remove standalone numbers, excluding those followed by '%' or letters\n",
" cleaned_text = re.sub(r\"\\b\\d+\\b(?![%a-zA-Z])\", \"\", cleaned_text)\n",
"\n",
" # Step 6: Remove empty square brackets created without a number\n",
" cleaned_text = re.sub(r\"\\[\\]\", \"\", cleaned_text).strip()\n",
"\n",
" # Step 7: Remove empty parentheses \"()\"\n",
" cleaned_text = re.sub(r\"\\(\\s*\\)\", \"\", cleaned_text).strip()\n",
"\n",
" # Step 8: Remove any remaining HTML tags, including those with attributes\n",
" cleaned_text = re.sub(r\"<[^>]+>\", \"\", cleaned_text).strip()\n",
"\n",
" # Step 9: Add a line number at the start of each new line\n",
" lines = cleaned_text.splitlines() # Split text by lines\n",
" cleaned_text = \"\\n\".join(f\"{i + 1}. {line.strip()}\" for i, line in enumerate(lines) if line.strip())\n",
"\n",
" return cleaned_text\n",
" \n",
"\n",
"# Apply the cleaning function to the 'ECLM' column in the DataFrame\n",
"df['AB'] = df['AB'].apply(clean_abstract)\n",
"df['ECLM'] = df['ECLM'].apply(clean_claim_text)\n",
"df['DESC'] = df['DESC'].apply(clean_description_text)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" ID \n",
" TI \n",
" AB \n",
" ECLM \n",
" DESC \n",
" \n",
" \n",
" 0 \n",
" EP4441780_A1 \n",
" Porous plug for electrostatic chuck gas delivery \n",
" Porous plugs for gas delivery in substrate sup... \n",
" 1. A porous plug for use in a substrate suppor... \n",
" 1. FIELD\\n2. Embodiments of the present disclo... \n",
" \n",
" \n",
" 1 \n",
" EP-412229_A1 \n",
" Thermally stable sealing device for a butterfl... \n",
" A rotary valve assembly having a housing, a va... \n",
" 1. Temperature-resistant bidirectional sealing... \n",
" 1. BACKGROUND OF THE INVENTIONThis invention r... \n",
" \n",
" \n",
" 2 \n",
" DE102019112957_A1 \n",
" Annular seal for spark plug or the like \n",
" A sealing unit 1 comprises annular first and s... \n",
" 1. A seal comprising an annular first seal el... \n",
" 1. Field of technology\\n2. This disclosure rel... \n",
" \n",
" \n",
" 3 \n",
" US20040200715_A1 \n",
" Dynamic flange seal and sealing system \n",
" The present invention features a flange to fla... \n",
" What is claimed and desired to be secured by L... \n",
" 1. RELATED APPLICATIONS\\n2. This application c... \n",
" \n",
" \n",
" \n",
"4 \n",
" WO201879632_A1 \n",
" Seal structure, sealing method, and coupling e... \n",
" Provided are a seal structure and sealing meth... \n",
" 1. A seal structure provided at a butt portion... \n",
" 1. This application is related to co-pending a... \n",
"