from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
NCATS Natural History Studies
Using Alhazen to extract information from Natural History Studies.
Note - this question is inherently driven by discussion and informal experience (as opposed to formal experimentation). So we would expect to
Preliminaries
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)
SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)
N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)
Remember to set environment variables for this code:
ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into.
LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
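The next cell hard-codes both values for this demo. If you launch the notebook from a shell where these variables are already exported, a minimal alternative sketch (the path below is illustrative, not the author's setup) uses setdefault so existing values are not overwritten:

# Sketch: only set these if the shell has not already provided them
os.environ.setdefault('ALHAZEN_DB_NAME', 'natural_history_studies')
os.environ.setdefault('LOCAL_FILE_PATH', '/path/to/alhazen/files/')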
os.environ['ALHAZEN_DB_NAME'] = 'natural_history_studies'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']
# Back up any existing copy of the database, then drop and recreate it from scratch
backup_ceifns_database('natural_history_studies', loc+'/natural_history_studies.db')
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
# Set up the local literature database and the LLMs that drive the agent
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)

cb = AlhazenAgent(llm2, llm2)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
dois = ['10.1007/s40123-019-00218-9',
        '10.1136/heartjnl-2013-304920',
        '10.21037/cdt.2018.09.18',
        '10.1038/sc.2013.170',
        '10.1016/j.jacc.2006.07.053',
        '10.1186/s12884-016-1076-8',
        '10.1200/PO.20.00218',
        '10.1056/NEJMoa021736',
        '10.1093/europace/euw067',
        '10.7150/jca.32579']
# Build the collection from Europe PMC in batches of `step` DOIs, then ask the agent to fetch full text
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 40
for start_i in range(0, len(dois), step):
    query = ' OR '.join(['doi:\"'+dois[i]+'\"' for i in range(start_i, start_i+step) if i < len(dois)])
    addEMPCCollection_tool.run({'id': '0', 'name':'Basic Extraction Demo', 'query':query, 'full_text':True})

cb.agent_executor.invoke({'input':'Download all available full text for papers in the collection with id="0"'})
ldb.report_collection_composition()
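Beyond the built-in composition report, you can query the database directly through the SQLAlchemy aliases defined above. A minimal sketch (assuming each downloaded paper is stored as a ScientificKnowledgeExpression row) simply counts the expressions now present:

# Sketch: count the ScientificKnowledgeExpression records loaded so far
n_expressions = ldb.session.query(func.count(SKE.id)).scalar()
print(f'{n_expressions} expressions currently in the database')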
study_type = 'natural history studies'

# Get the metadata extraction tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for d in dois:
    item_types = set()
    d_id = 'doi:'+d
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2])
# Iterate over papers to run the metadata extraction tool
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
for d in [d for d in dois]:
    item_types = set()
    d_id = 'doi:'+d

    # Skip if the doi is already in the database
    #if len(df)>0 and d_id in df.doi.unique():
    #    continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': study_type})

    # Add the results to the dataframe
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2])
# Re-read all extraction notes and export them as a tab-separated file
df = pd.DataFrame()
for d in [d for d in dois]:
    d_id = 'doi:'+d
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2])
df.to_csv(loc+'/nhs_metadata_extraction.csv', index=False, sep='\t')
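As a quick sanity check on the export, the tab-separated file can be read straight back with pandas (a sketch; the exact columns depend on what the extraction tool returned):

# Sketch: reload the exported file and inspect its size
df_check = pd.read_csv(loc+'/nhs_metadata_extraction.csv', sep='\t')
print(df_check.shape)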
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'MetadataExtractionNote')
for row in q3.all():
    d_id = row[0]
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)
backup_ceifns_database(os.environ['ALHAZEN_DB_NAME'], loc+'/nhs_metadata_extraction.db.backup2')