from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.tools.basic import AddCollectionFromEPMCTool
from alhazen.tools.paperqa_emulation_tool import *
from alhazen.toolkit import *
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_google_vertexai import ChatVertexAI
from datetime import datetime, timedelta
from importlib_resources import files
import json
import os
import pandas as pd
from sqlalchemy import func, text, Date, Integer
from time import time
from tqdm import tqdm
from transformers import pipeline, AutoModel, AutoTokenizer
import torch
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough, RunnableLambda
from operator import itemgetter
from langchain.schema import get_buffer_string, OutputParserException, format_document
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from alhazen.utils.output_parsers import JsonEnclosedByTextOutputParser
#from paperqa.prompts import summary_prompt as paperqa_summary_prompt, qa_prompt as paperqa_qa_prompt, select_paper_prompt, citation_prompt, default_system_prompt
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel
import local_resources.queries.rao_grantees as rao_files
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
Rare Disease Literature.
Preliminaries
Here we set up libraries and methods to create and query the local Postgres database that we will use to store information from the Alhazen tools and agent.
Remember to set environment variables for this code:
ALHAZEN_DB_NAME - the name of the Postgres database you are storing information into
LOCAL_FILE_PATH - the location on disk where you save files for your digital library, downloaded models, or other data.
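If these variables are not already set in your shell, one option is to set them in Python before running the rest of the notebook (the values below are illustrative placeholders, not the notebook's actual settings):

# Hypothetical example values; substitute your own database name and file path.
os.environ['ALHAZEN_DB_NAME'] = 'rare_as_one_diseases'
os.environ['LOCAL_FILE_PATH'] = '/path/to/local/alhazen/files/'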
if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

loc = os.environ['LOCAL_FILE_PATH']
db_name = 'rare_as_one_diseases'
Run this command to destroy your current database.
USE WITH CAUTION.
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
Run this command to create a new, empty database.
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
The following code instantiates the database connection, the available LLMs, and the Alhazen agent, and then lists all the tools the agent system has access to.
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms = lookup_chat_models()

llm_databricks_llama3 = ChatOpenAI(base_url='https://czi-shared-infra-czi-sci-general-prod-databricks.cloud.databricks.com/serving-endpoints',
                                   api_key=os.environ['DATABRICKS_API_KEY'],
                                   model='databricks-meta-llama-3-70b-instruct')
llm_dbrx = llms.get('gpt4_1106')

cb = AlhazenAgent(db_name=db_name, agent_llm=llm_databricks_llama3, tool_llm=llm_databricks_llama3)

print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
Build paper collections
This section builds a literature collection for each of the diseases in the Rare As One cohorts for cycles 1 and 2.
What diseases are we querying the literature for?
cols_to_include = ['ID', 'CORPUS_NAME', 'TERMS']
df = pd.read_csv(files(rao_files).joinpath('CZI_RAO_diseases.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])
df
This command iterates over the list of collections and runs a query for each one against the European PMC website by processing the TERMS column from the dataframe with the QueryTranslator utility. This generates a boolean search query over the TITLE_ABS field in the remote database (see https://www.ebi.ac.uk/europepmc/webservices/rest/fields for possible fields to search).
qt = QueryTranslator(df.sort_values('ID'), 'ID', 'TERMS', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS'])
corpus_names = df['CORPUS_NAME']
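Before adding the collections, it can help to print the generated queries and confirm how each TERMS entry was translated (a quick inspection step, not part of the original notebook):

# Inspect the boolean queries generated by the QueryTranslator
for c_id, c_name, c_query in zip(corpus_ids, corpus_names, epmc_queries):
    print(c_id, c_name)
    print('\t' + c_query)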
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
    if id < 60:
        continue
    addEMPCCollection_tool.run(tool_input={'id': id, 'name': name, 'query': query, 'full_text': False})
#
# Note - create a new corpus for collaborative discussions with CellXGene team (in particular Maximillian L.)
#
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
id = '83'
name = 'Diffuse Midline Glioma'
query = '"diffuse midline glioma" OR "diffuse intrinsic pontine glioma" OR "brainstem glioma"'
addEMPCCollection_tool.run(tool_input={'id': id, 'name': name, 'query': query, 'full_text': False})
PaperQA_tool = [t for t in cb.tk.get_tools() if isinstance(t, PaperQAEmulationTool)][0]
question = 'Write an essay to answer the question: "What is the connection between SEC61B and the unfolded protein response?"'
PaperQA_tool.run(tool_input={'question': question})

cb.agent_executor.invoke({'input': 'Write an essay to answer the question: "What known gene variants are associated with Primary Ciliary Dyskinesia?"'})
Crazy Bug
Running the query below with skc.id='83' takes 6 minutes, while the same query with skc.id='79' takes less than 1 second.
ldb.session.rollback()
ldb.session.execute(text('''SELECT DISTINCT skc.name, ske.id, ske.content, ske.publication_date as pub_date, ske.type as pub_type, emb.embedding, skf.content
FROM langchain_pg_embedding as emb,
"ScientificKnowledgeExpression" as ske,
"ScientificKnowledgeCollection_has_members" as skc_hm,
"ScientificKnowledgeCollection" as skc,
"ScientificKnowledgeFragment" as skf
WHERE emb.cmetadata->>'i_type' = 'CitationRecord' AND
emb.cmetadata->>'e_id' = ske.id AND
emb.cmetadata->>'f_id' = skf.id AND
skc_hm."ScientificKnowledgeCollection_id" = skc.id AND
ske.id = skc_hm.has_members_id AND (skc.id='79')
ORDER BY pub_date DESC;''')).fetchall()
ldb.session.rollback()
ldb.session.execute(text('''SELECT DISTINCT skc.name, ske.id, ske.content, ske.publication_date as pub_date, ske.type as pub_type, emb.embedding, skf.content
FROM langchain_pg_embedding as emb,
"ScientificKnowledgeExpression" as ske,
"ScientificKnowledgeCollection_has_members" as skc_hm,
"ScientificKnowledgeCollection" as skc,
"ScientificKnowledgeFragment" as skf
WHERE emb.cmetadata->>'i_type' = 'CitationRecord' AND
emb.cmetadata->>'e_id' = ske.id AND
emb.cmetadata->>'f_id' = skf.id AND
skc_hm."ScientificKnowledgeCollection_id" = skc.id AND
ske.id = skc_hm.has_members_id AND (skc.id='83')
ORDER BY pub_date DESC;''')).fetchall()
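One way to investigate the discrepancy (a debugging suggestion, not part of the original notebook) is to compare the Postgres query plans for the fast and slow collections with EXPLAIN ANALYZE:

# Compare query plans for the fast ('79') and slow ('83') collections
for skc_id in ['79', '83']:
    sql = f'''EXPLAIN ANALYZE SELECT DISTINCT skc.name, ske.id
        FROM langchain_pg_embedding as emb,
            "ScientificKnowledgeExpression" as ske,
            "ScientificKnowledgeCollection_has_members" as skc_hm,
            "ScientificKnowledgeCollection" as skc,
            "ScientificKnowledgeFragment" as skf
        WHERE emb.cmetadata->>'i_type' = 'CitationRecord' AND
            emb.cmetadata->>'e_id' = ske.id AND
            emb.cmetadata->>'f_id' = skf.id AND
            skc_hm."ScientificKnowledgeCollection_id" = skc.id AND
            ske.id = skc_hm.has_members_id AND (skc.id='{skc_id}');'''
    plan = ldb.session.execute(text(sql)).fetchall()
    print('Query plan for skc.id = ' + skc_id)
    for row in plan:
        print(row[0])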
q = ldb.session.query(SKE).distinct(SKE.id)
print(q.count())
ldb.embed_expression_list(q.all())

cb.agent_executor.invoke({'input': 'Get full text copies of all papers in the collection with id="0".'})
q = ldb.session.query(SKC.id, SKC.name, func.count(SKC_HM.has_members_id)) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .group_by(SKC.id, SKC.name) \
        .order_by(SKC.id.cast(Integer))
corpora_df = pd.DataFrame(q.all(), columns=['Corpus ID', 'Corpus Name', 'Paper Count'])

paper_count = ldb.session.query(func.count(SKE.id)).first()
print('Count of all papers in database: %d'%(paper_count[0]))
corpora_df
q3 = ldb.session.query(N) \
        .filter(N.type == 'NoteAboutFragment')

for n in q3.all():
    n_content = json.loads(n.content)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(n.id)
    print(n_content.get('response'))
    print(n_content.get('data'))

skes = ldb.session.query(SKE).all()
#ldb.embed_expression_list(skes)
print(len(skes))
ldb.session.rollback()

ft_retriever = [t for t in cb.tk.get_tools() if isinstance(t, RetrieveFullTextTool)][0]

for i, c in corpora_df.iterrows():
    if c['Corpus ID'] != '81':
        continue
    print(c['Corpus Name'])
    ft_count = 0
    no_ft_count = 0
    doi_list = [e.id for e in ldb.list_expressions(collection_id=c['Corpus ID'])]
    for doi in doi_list:
        d2 = doi.replace('doi:', '')
        path = loc+db_name+'/ft/'
        nxml_file_path = path+'/'+d2+'.nxml'
        pdf_file_path = path+'/'+d2+'.pdf'
        html_file_path = path+'/'+d2+'.html'
        if os.path.exists(nxml_file_path) or \
                os.path.exists(pdf_file_path) or \
                os.path.exists(html_file_path):
            ft_count += 1
        else:
            no_ft_count += 1
            try:
                #print('\t'+doi)
                ft_retriever.run(tool_input={'paper_id': doi})
            except Exception as e:
                print(e)
    print(ft_count)
    print(no_ft_count)
q = ldb.session.query(SKE.id, SKI.id, SKI.type, SKF.id, SKF.type, SKF.offset, SKF.content) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id==SKF.id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKF.type=='section') \
        .filter(SKI.type.like('%FullText')) \
        .order_by(SKE.id, SKF.offset)
items_df = pd.DataFrame(q.all(), columns=['doi', 'item_id', 'item_type', 'fragment_id', 'fragment_type', 'offset', 'content'])

items_df
Index the abstracts and run some simple semantic queries
Here we index each paper’s title and abstract to build a simple question / answer interface.
ldb.session.rollback()
for i, c in tqdm(corpora_df.iterrows()):
    if c['Corpus ID'] != '81':
        continue
    expressions = ldb.list_expressions(collection_id=c['Corpus ID'])
    ldb.embed_expression_list(expressions)

question = 'What is known about genetics underlying Stiff Person Syndrome?'

ldb.query_vectorindex(question, k=10, collection_name='ScienceKnowledgeItem_FullText')
ATTEMPTING TO RECONSTRUCT PAPER-QA PIPELINE IN OUR SYSTEM.
- Embed paper sections + question
- Given the question, summarize the retrieved paper sections relative to the question
- Score and select relevant passages
- Put summaries into prompt
- Generate answer with prompt
os.environ['PGVECTOR_CONNECTION_STRING'] = "postgresql+psycopg2:///"+ldb.name
vectorstore = PGVector.from_existing_index(
    embedding = ldb.embed_model,
    collection_name = 'ScienceKnowledgeItem')
retriever = vectorstore.as_retriever(search_kwargs={'k':15, 'filter': {'skc_ids': 81}})
#retriever = vectorstore.as_retriever()
retriever.invoke(question)
hum_p = '''First, read through the following JSON encoding of {k} research articles:

Each document has three attributes: (A) a digital object identifier ('DOI') code, (B) a CITATION string containing the authors, publication year, title and publication location, and the (C) CONTENT field with the title and abstract of the paper.

```json:{context}```

Then, generate a JSON list of summaries of each article in order to help answer the following question:

Question: {question}

Do NOT directly answer the question, instead summarize to give evidence to help answer the question.
Focus on specific details, including numbers, equations, or specific quotes.
Reply "Not applicable" if text is irrelevant.
Restrict each summary to {summary_length} words.
Also, provide a score from 1-10 indicating relevance to question. Do not explain your score.
Write this answer as JSON formatted output. Provide a list of {k} dict objects with the following fields: DOI, SUMMARY, RELEVANCE SCORE.
Do not provide additional explanation for the answer.
Do not include any other response other than a JSON object.
'''

sys_p = '''Answer in a direct and concise tone. Your audience is an expert, so be highly specific. If there are ambiguous terms or acronyms, first define them.'''
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="'DOI': '{ske_id}', CITATION: '{citation}', CONTENT:'{page_content}'")

def combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="},{\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return '[{'+document_separator.join(doc_strings)+'}]'

template = ChatPromptTemplate.from_messages([
    ("system", sys_p),
    ("human", hum_p)])
qa_chain = (
    RunnableParallel({
        "k": itemgetter("k"),
        "question": itemgetter("question"),
        "summary_length": itemgetter("summary_length"),
        "context": itemgetter("question") | retriever | combine_documents,
    })
    | {
        "summary": template | ChatOllama(model='mixtral') | JsonEnclosedByTextOutputParser(),
        "context": itemgetter("context"),
    }
)

input = {'question': question, 'summary_length': 1000, 'k': 5}
out = qa_chain.invoke(input, config={'callbacks': [ConsoleCallbackHandler()]})
print(json.dumps(out, indent=4))
Discourse Analysis
model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model=model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
# Try an out-of-the-box classifier on the data for discourse tagging.
ldb.session.rollback()
one_year_ago = (datetime.now() - timedelta(days=1*365))

q = ldb.session.query(SKE, SKF) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id==SKF.id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKI.type == 'CitationRecord') \
        .order_by(SKE.id)
# .filter(SKC.name == 'The Stiff Person Syndrome' ) \
# .filter(SKE.publication_date >= one_year_ago) \

s_list = []
for e, f in q.all():
    for i, s in enumerate(ldb.sent_detector.tokenize(f.content)):
        s_list.append([e.id, f.id, i, s])
sent_df = pd.DataFrame(s_list, columns=['doi', 'f_id', 's_id', 'text'])
sent_df
# Predict multiple texts on a single CPU and time the inference duration
start = time()

df = sent_df

preds = classifier([row.text for i, row in df.iterrows()])
pred_df = pd.DataFrame(preds)
df['label'] = [lookup[row.label] for i, row in pred_df.iterrows()]
df['score'] = [row.score for i, row in pred_df.iterrows()]

end = time()
print('Prediction time:', str(timedelta(seconds=end-start)))
df
ldb.session.rollback()

# Generate fragment sentences and add them as Notes
ldb.session.rollback()
for i, row in df.iterrows():
    f_q = ldb.session.query(SKF).filter(SKF.id == row.f_id).first()
    i_q = ldb.session.query(SKI).filter(SKI.id == row.f_id.split('.')[0]).first()
    o = i_q.content.find(row.text)
    l = len(row.text)
    sentence_fragment = ScientificKnowledgeFragment(id=f_q.id+'.'+str(row.s_id), \
                                                    content=row.text, \
                                                    offset=o, \
                                                    length=l, \
                                                    type='sentence')
    i_q.has_part.append(sentence_fragment)
    note_content = {'discourse_label': row.label, 'score': row.score}
    n = Note(id=f_q.id+'.'+str(row.s_id)+'.discourse_type',
             content=json.dumps(note_content, indent=4),
             format='json',
             type='NoteAboutFragment')
    sentence_fragment.has_notes.append(n)
    ldb.session.flush()
ldb.session.commit()
Running DRSM Classifiers.
model_path = '/Users/gully.burns/Documents/2024H1/models/drsm_classifier'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model=model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
Topic Modeling over the corpus.
What are the main topics being discussed in each paper?
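The notebook does not include a topic model at this point, so here is a minimal sketch of one way to answer this question, assuming scikit-learn is available and reusing the sent_df sentence table built in the Discourse Analysis section (LDA is an illustrative choice, not the project's prescribed method):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Rebuild one document per paper from its sentences
docs = sent_df.groupby('doi')['text'].apply(' '.join)

# Bag-of-words representation, dropping very rare and very common terms
vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')
X = vectorizer.fit_transform(docs)

# Fit a 10-topic LDA model and report the top terms per topic
lda = LatentDirichletAllocation(n_components=10, random_state=0)
doc_topics = lda.fit_transform(X)
terms = vectorizer.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top_terms = [terms[i] for i in topic.argsort()[-10:][::-1]]
    print('Topic %d: %s' % (k, ', '.join(top_terms)))

# Dominant topic for each paper
topic_df = pd.DataFrame({'doi': docs.index, 'dominant_topic': doc_topics.argmax(axis=1)})
topic_df.head()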
Search for and download Full Text Papers.
Can we search for all Stiff Person Syndrome papers published in the last 10 years?
ldb.session.rollback()
ten_years_ago = (datetime.now() - timedelta(days=10*365))
print(ten_years_ago)

q = ldb.session.query(func.extract('year', SKE.publication_date.cast(Date)), func.count(SKE.id)) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.publication_date >= ten_years_ago) \
        .filter(SKC.name == 'The Stiff Person Syndrome') \
        .group_by(func.extract('year', SKE.publication_date.cast(Date))) \
        .order_by(func.extract('year', SKE.publication_date.cast(Date)))
sps_pubcount_df = pd.DataFrame(q.all(), columns=['year', 'paper_count'])
sps_pubcount_df
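To visualize the publication trend, a minimal plotting sketch (assuming matplotlib is installed; this is not part of the original notebook):

import matplotlib.pyplot as plt

# Bar chart of paper counts per publication year
sps_pubcount_df.plot.bar(x='year', y='paper_count', legend=False)
plt.xlabel('Publication year')
plt.ylabel('Paper count')
plt.title('Stiff Person Syndrome papers per year')
plt.show()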
Run PaperQA
cb.agent_executor.invoke({'input': 'Write a short essay on "What connections between primary ciliary dyskinesia and primary cilia have been studied?" based on the collection with ID="70".'})