import openai
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import InMemorySaver
# from dotenv import load_dotenv
# load_dotenv()
GENAI_API_URL = os.getenv('GENAI_API_URL')
GENAI_API_KEY = os.getenv('GENAI_API_KEY')
CHAT_MODEL='gpt-4.1'
client=openai.OpenAI(api_key=GENAI_API_KEY, base_url=GENAI_API_URL, timeout=300)
model = ChatOpenAI(model=CHAT_MODEL, temperature=0.0,
api_key=GENAI_API_KEY, base_url=GENAI_API_URL,
timeout=300)
def get_unique_addresses(addresses_df:pd.DataFrame, skip_pcodes:list=None)->list:
"""from a list of addresses (pd.DataFrame), identify unique addresses and return them as a list of dict.
It uses LLM to provide fuzzy matching.
The list should be small, it only make sense to do this if all the addresses belong to same suburb / postcode.
Args:
addresses_df - the pandas DataFrame containing the addresses to be cleaned
skip_pcodes - list of postcodes to be skipped (for resume)
"""
ADDR_COLS=['TRADING_ADDRESS1', 'TRADING_ADDRESS2', 'TRADING_ADDRESS_CITY', 'TRADING_POSTCODE']
checkpointer = InMemorySaver()
agent = create_agent(model=model,
system_prompt="You are a helpful assistant.",
checkpointer=checkpointer)
config = {"configurable": {"thread_id": "address chat"}}
postcodes = addresses_df['TRADING_POSTCODE'].astype(str).unique()
if skip_pcodes:
postcodes = [pc for pc in postcodes if pc not in skip_pcodes]
for pcode in postcodes:
pcode_addr_df = addresses_df[addresses_df['TRADING_POSTCODE']==pcode]
full_addresses:List[str] = pcode_addr_df[ADDR_COLS].apply(
lambda x: ', '.join(x.dropna().astype(str).str.strip()), axis=1
)
with pd.option_context('mode.chained_assignment', None):
pcode_addr_df['TRADING_ADDRESS'] = full_addresses
prompt = f"""Here is a list of addresses as multi-line strings. Some of the addresses are the same even though they contain slightly different spelling or formats (e.g. with or without dashes '-').
Full words and their abbreviations should be consider the same - e.g. St and Street are the same.
Some addresses are in address line 1, some spreads to both line 1 and line 2, but if they represent the same physical location, then they are the same.
Some addresses have the building name, some don't; but as long as they have the same physical location, they should be consider the same.
Some addresses have consecutive street numbers but written differently - e.g. using '-' or 'TO'. In such cases, use dash '-', instead of 'TO'.
Check through the list and identify the unique addresses.
Go through the list of input addresses, replace them with their corresponding unique address.
Return the updated addresses as a list of json messages using 'ADDRESS' as key, in a json code block using markdown code fences and nothing else. Make sure for each input address, you return a cleaned up address, so that the number of addresses in the input is the same as the returned addresses.
{full_addresses.unique()}
"""
response = agent.invoke(
{"messages": [("user", prompt)]},
config=config
)
clean_addresses = []
for addr in extract_json_in_md(response['messages'][-1].content):
clean_addresses.append(format_trading_address(addr))
# Create a dictionary with unique addresses as keys
address_dict = dict(zip(full_addresses.unique(), clean_addresses))
with pd.option_context('mode.chained_assignment', None):
pcode_addr_df['TRADING_ADDRESS'] = pcode_addr_df['TRADING_ADDRESS'].map(address_dict).fillna(pcode_addr_df['TRADING_ADDRESS'])
addresses_df.loc[pcode_addr_df.index, 'TRADING_ADDRESS'] = pcode_addr_df['TRADING_ADDRESS']
yield addresses_df, pcode_addr_df.index, pcode
def extract_json_in_md(markdown_text:str)->dict:
"""
Extracts a JSON code block from markdown text and parses it into a Python dictionary.
Args:
markdown_text (str): A string containing markdown, potentially with a JSON code block.
Returns:
dict or None: The parsed Python dictionary, or None if no valid JSON block is found.
"""
# This regex looks for ```json, captures everything (non-greedy) until the next ```
# re.DOTALL (re.S) allows the dot (.) to match newline characters.
pattern = re.compile(r'```json\n(.*?)\n```', re.DOTALL | re.IGNORECASE)
match = pattern.search(markdown_text)
if match:
json_string = match.group(1).strip()
try:
# Use the json module to parse the extracted string into a Python dict
data = json.loads(json_string)
return data
except json.JSONDecodeError as e:
print(f"Error decoding JSON: {e}")
print("markdown test: ", markdown_text)
return None
else:
print("No JSON code block found.")
return None