NCATS Natural History Studies

Using Alhazen to extract information from Natural History Studies.

Note - this question is inherently driven by discussion and informal experience (as opposed to formal experimentation), so we would expect the work described here to be exploratory rather than definitive.

Preliminaries

from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database
from alhazen.utils.searchEngineUtils import *

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)

Remember to set environment variables for this code:

  • ALHAZEN_DB_NAME - the name of the PostgreSQL database you will store information in
  • LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
os.environ['ALHAZEN_DB_NAME'] = 'natural_history_studies'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if not os.path.exists(os.environ['LOCAL_FILE_PATH']):
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

if os.environ.get('ALHAZEN_DB_NAME') is None: 
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']
# Back up any existing copy of this database, then rebuild it from scratch
backup_ceifns_database('natural_history_studies', loc+'/natural_history_studies.db')
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
# Chat models: a local Mixtral model via Ollama and GPT-4 via the OpenAI API
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)

cb = AlhazenAgent(llm2, llm2)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
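The agent above is constructed with the GPT-4 model in both of its model slots. To run the workflow entirely on local hardware, the same constructor call should accept the local Mixtral model instead; this is an untested sketch inferred from the call above rather than from the AlhazenAgent documentation.

# Alternative (sketch): drive the agent entirely with the local Mixtral model
# cb = AlhazenAgent(llm, llm)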
dois = ['10.1007/s40123-019-00218-9', 
        '10.1136/heartjnl-2013-304920',
        '10.21037/cdt.2018.09.18',
        '10.1038/sc.2013.170',
        '10.1016/j.jacc.2006.07.053',
        '10.1186/s12884-016-1076-8',
        '10.1200/PO.20.00218',
        '10.1056/NEJMoa021736',
        '10.1093/europace/euw067',
        '10.7150/jca.32579']
# Find the Europe PMC collection tool in the agent's toolkit
addEPMCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]

# Add the papers to collection '0', querying Europe PMC in batches of 40 OR-joined DOIs
step = 40
for start_i in range(0, len(dois), step):
    query = ' OR '.join(['doi:"'+dois[i]+'"' for i in range(start_i, start_i+step) if i < len(dois)])
    addEPMCCollection_tool.run({'id': '0', 'name':'Basic Extraction Demo', 'query':query, 'full_text':True})
cb.agent_executor.invoke({'input':'Download all available full text for papers in the collection with id="0"'})
ldb.report_collection_composition()
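Beyond the built-in composition report, it is worth cross-checking the loaded collection against the original DOI list to catch any papers that Europe PMC failed to return. The sketch below assumes that expression identifiers take the form 'doi:<doi>', which is consistent with how papers are addressed in the extraction calls further down.

# Cross-check (sketch): confirm each requested DOI is present as an expression in the database
loaded_ids = {row[0] for row in ldb.session.query(SKE.id).all()}
for d in dois:
    if 'doi:'+d not in loaded_ids:
        print('Missing from collection:', d)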
study_type = 'natural history studies'
# Get the metadata extraction tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]

# Create a dataframe to hold any previously extracted metadata
df = pd.DataFrame()
for d in dois:
    d_id = 'doi:'+d
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2])
# Iterate over the papers and run the metadata extraction tool on each one
for d in dois:
    d_id = 'doi:'+d

    # Skip if the doi is already in the database
    #if len(df)>0 and d_id in df.doi.unique():
    #    continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': study_type})

    # Add the results to the dataframe
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2])
# Rebuild the dataframe from the freshly extracted metadata notes
df = pd.DataFrame()
for d in dois:
    d_id = 'doi:'+d
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, study_type))
    df = pd.concat([df, df2]) 
df.to_csv(loc+'/nhs_metadata_extraction.csv', index=False, sep='\t')
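Note that the exported table is tab-separated despite its .csv extension, so pass the same separator when reloading it for downstream analysis.

# Reload the exported metadata extraction table (tab-separated)
df_check = pd.read_csv(loc+'/nhs_metadata_extraction.csv', sep='\t')
print(df_check.shape)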
# USE WITH CAUTION - this will delete extracted metadata notes from the database
# Clears all notes attached to any paper that carries a MetadataExtractionNote
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'MetadataExtractionNote') 
for row in q3.all():
    d_id = row[0]
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)
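After the cleanup completes (and assuming delete_note flushes its changes), re-running the same query should match nothing; a quick count confirms the notes are gone.

# Sanity check: the query should now match zero metadata extraction notes
print(q3.count(), 'metadata extraction notes remaining')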
backup_ceifns_database(os.environ['ALHAZEN_DB_NAME'], loc+'/nhs_metadata_extraction.db.backup2')