from alhazen.aliases import *
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import json
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, text, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
import pymde
import torch
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from alhazen.utils.searchEngineUtils import load_paper_from_openalex, read_references_from_openalex
from pyalex import config, Works, Work
= "gully.burns@chanzuckerberg.com"
config.email
import local_resources.data_files.rnaquarium as rnaquarium
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
RNAquarium
Basics
Python Imports
Set up Python imports, environment variables, and other crucial parameters here.
Environment Variables
Remember to set environment variables for this code:
ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data
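For example, both variables can be exported in the shell before launching the notebook (the values below mirror the ones set in the next cell; adjust them to your own setup):
$ export ALHAZEN_DB_NAME=rnaquarium
$ export LOCAL_FILE_PATH=/users/gully.burns/alhazen/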
os.environ['ALHAZEN_DB_NAME'] = 'rnaquarium'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']
Setup utils, agents, and tools
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)

cb = AlhazenAgent(llm2, llm2)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

test_tk = MetadataExtractionToolkit(db=ldb, llm=llm2)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
    print('\t'+type(t).__name__)
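Note that ChatOpenAI reads its API key from the OPENAI_API_KEY environment variable, and ChatOllama assumes a local Ollama server with the named models already pulled. For example (placeholder value):
$ export OPENAI_API_KEY=<your key>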
Building the database
Scripts to Build / Delete the database
If you need to restore a deleted database from backup, use the following shell commands:
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
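To create such a backup in the first place, a pg_dump call along these lines should work (a sketch; substitute your own database name and output path):
$ pg_dump -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql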
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
Build the CEIFNS database from the 900 DOIs in the spreadsheet
Load data from the spreadsheet
df = pd.read_csv(files(rnaquarium).joinpath('RNAquarium_paper_list.tsv'), sep='\t')
dois = df['DOI'].to_list()
df
Run this cell to execute paged queries (in batches of 40) over European PMC for each of the DOIs in the spreadsheet loaded above.
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 40
for start_i in range(0, len(dois), step):
    query = ' OR '.join(['doi:\"'+dois[i]+'\"' for i in range(start_i, start_i+step) if i < len(dois)])
    addEMPCCollection_tool.run({'id': '0', 'name':'RNAquarium Papers', 'query':query, 'full_text':True})
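For illustration, each batch is collapsed into a single EPMC query string of the following form (the DOIs shown here are placeholders, not entries from the spreadsheet):
doi:"10.1234/example.one" OR doi:"10.5678/example.two" OR doi:"10.9012/example.three"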
Run this cell to check how many papers from the list are loaded in our database.
# Compare contents of database to the list of dois
missing_list = []
titles = []
for doi in dois:
    row = df[df['DOI']==doi]
    doi_in_db = ldb.session.query(SKE).filter(SKE.id=='doi:'+doi.lower()).all()
    if len(doi_in_db) == 0:
        print('DOI: '+doi)
        print('\t%s (%d) %s %s'%(row['Author'].iloc[0], row['Publication Year'].iloc[0], row['Title'].iloc[0], row['Journal Abbreviation'].iloc[0]))
        missing_list.append(doi)
        titles.append(row['Title'].iloc[0])
print('%d Missing DOIs'%(len(missing_list)))
Use OpenAlex as a fallback to add papers that were missed on EPMC
ldb.session.rollback()
corpus = ldb.session.query(SKC).filter(SKC.id=='0').first()
count = 0
print(len(corpus.has_members))

papers_to_index = []
for i, doi in enumerate(missing_list):
    p = load_paper_from_openalex(doi)
    ldb.session.add(p)
    corpus.has_members.append(p)
    p.member_of.append(corpus)
    for item in p.has_representation:
        for f in item.has_part:
            #f.content = '\n'.join(self.sent_detector.tokenize(f.content))
            f.part_of = item.id
            ldb.session.add(f)
        item.represented_by = p.id
        ldb.session.add(item)
    papers_to_index.append(p)
    ldb.session.flush()
ldb.embed_expression_list(papers_to_index)
ldb.session.commit()
Get full-text copies of all the papers in the collection
This invokes the agent directly to make it easy to run the retrieval tool.
cb.db.session.rollback()
cb.agent_executor.invoke({'input':'Retrieve full text for the collection with id="0".'})
Analyze Collections
Build a basic report of the composition of all collections in the database (listed by types of items).
cb.db.report_collection_composition()
cb.db.report_non_full_text_for_collection(0)
Tests + Checks
Agent tool selection + execution + interpretation
# use this cell to test the agent's tool selection, execution, and interpretation
cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})
Run MetaData Extraction Chain over listed papers
Here, we run various versions of the metadata extraction tool to examine performance over the RNAquarium dataset.
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]

# Hack to get the path to the metadata directory as a string
metadata_dir = str(files(rnaquarium).joinpath('temp'))[0:-4]

# Compile the answers from the metadata directory
#t2.compile_answers('cryoet', metadata_dir)
# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d_id in dois:
    item_types = set()
    #d_id = 'doi:'+d
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, 'rnaquarium'))
    df = pd.concat([df, df2])
# Iterate over papers to run the metadata extraction tool
for d_id in dois[0:10]:
    item_types = set()
    #d_id = 'doi:'+d

    # Skip if the doi is already in the database
    if len(df)>0 and d_id in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': 'rnaquarium'})

    # Add the results to the dataframe
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, 'rnaquarium'))
    df = pd.concat([df, df2])
q = cb.db.session.query(N) \
    .filter(N.id == NIA.Note_id) \
    .filter(N.type == 'MetadataExtractionNote') \
    .filter(N.name.like('rnaquarium_%'))
l = []
for n in q.all():
    tup = json.loads(n.content)
    t, doi, label = n.name.split('__')
    tup['doi'] = 'doi:'+doi
    tup['extraction_type'] = t
    tup['run_label'] = label
    l.append(tup)
report_df = pd.DataFrame(l).set_index('doi')
report_df
report_df.to_csv(loc+'/rnaquarium_metadata_extraction_report.tsv', sep='\t')
# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d_id in dois:
    df2 = pd.DataFrame(t2.read_metadata_extraction_notes(d_id, 'rnaquarium'))
    df = pd.concat([df, df2])
df
ldb.session.rollback()
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for d in list(set(dois[0:10])):
    d_id = 'doi:'+d
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    if e is None:
        continue
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)
Protocol Modeling + Extraction
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr')
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')

d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")
t = ProcotolExtractionTool(db=ldb, llm=llm2, description=d)
t.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077', 'extraction_type': 'cryoet'})
ldb.session.rollback()
rag_embeddings_list = [json.loads(e[0]) for e in ldb.session.execute(text("""
    SELECT DISTINCT emb.embedding
    FROM langchain_pg_embedding as emb,
        "ScientificKnowledgeExpression" as ske,
        "ScientificKnowledgeCollection_has_members" as skc_hm
    WHERE cmetadata->>'i_type' = 'CitationRecord' AND
        cmetadata->>'e_id' = ske.id AND
        ske.id = skc_hm.has_members_id AND
        skc_hm."ScientificKnowledgeCollection_id"='0';
    """)).fetchall()]
rag_embeddings_tensor = torch.FloatTensor(rag_embeddings_list)

proj_embeddings = pymde.preserve_neighbors(rag_embeddings_tensor, constraint=pymde.Standardized()).embed()
pymde.plot(proj_embeddings)