from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.tools.basic import *
from alhazen.tools.paperqa_emulation_tool import *
from alhazen.toolkit import *
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from sqlalchemy import func, text
from time import time
from tqdm import tqdm
from transformers import pipeline, AutoModel, AutoTokenizer
from transformers import pipeline, AutoModel, AutoTokenizer
import torch
import os
import torch
from transformers import pipeline
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough, RunnableLambda
from operator import itemgetter
from langchain.chat_models import ChatOllama
from langchain.schema import get_buffer_string, OutputParserException, format_document
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from alhazen.utils.output_parsers import JsonEnclosedByTextOutputParser
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel
Cell Biology Database.
Preliminaries
Here we set up libraries and methods to create and query the local Postgres database we will be using to store our information from the Alhazen tools and agent.
Remember to set environment variables for this code:
ALHAZEN_DB_NAME - the name of the Postgres database you are storing information into
LOCAL_FILE_PATH - the location on disk where you save files for your digital library, downloaded models or other data.
# Resolve where the local literature database lives on disk.
# LOCAL_FILE_PATH must be set; the directory is created when it is missing.
if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
if not os.path.exists(os.environ['LOCAL_FILE_PATH']):
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

# `loc` is the root directory for stored files; `db_name` is the database
# name passed to the database helpers further down.
loc = os.environ['LOCAL_FILE_PATH']
db_name = 'cell_biology'
Run this command to destroy your current database
USE WITH CAUTION
# WARNING: destructive. Drops the database named by the ALHAZEN_DB_NAME
# environment variable (raises KeyError when the variable is not set).
# NOTE(review): this reads ALHAZEN_DB_NAME rather than `db_name`; confirm the
# two are meant to refer to the same database.
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
Run this command to create a new, empty database.
# Create the database named by `db_name` (described above as new and empty).
create_ceifns_database(db_name)
This command lists all the tools the Alhazen agent system has access to
# Open the literature database rooted at `loc`.
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)

# Pick the chat model used both as the agent LLM and as the tool LLM.
# NOTE: the variable is called `llm_dbrx` but it holds the 'gpt4_1106' entry.
llms = lookup_chat_models()
llm_dbrx = llms.get('gpt4_1106')

# Build the agent and print the class name of every tool in its toolkit.
cb = AlhazenAgent(db_name=db_name, agent_llm=llm_dbrx, tool_llm=llm_dbrx)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
Build paper collections
This section will build a literature collection across each of the diseases in the Rare As One Cohorts for cycle 1 and 2.
What diseases are we querying the literature for?
#
# Note - create a new corpus for collaborative discussions with CellXGene team (in particular Maximillian L.)
#
# The export contained this cell twice, byte-for-byte; it is kept once so the
# same collection is not registered a second time.
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
# `id` shadows the builtin, but the name is kept because a later cell reads it.
id = 'SEC61B + Unfolded Protein Response'
name = 'Question: What is the connection between SEC61B and the unfolded protein response?'
query = 'What is the connection between SEC61B and the unfolded protein response?'
addEMPCCollection_tool.run(tool_input={'id': id, 'name':name, 'query':query, 'full_text':False})
#
# Note - run the full-text retrieval tool over the collection whose id is
# held in `id` (assigned where the collection is created).
#
retrieveFullTextTool_tool = [t for t in cb.tk.get_tools() if isinstance(t, RetrieveFullTextToolForACollection)][0]
retrieveFullTextTool_tool.run(tool_input={'collection_id': id})
#
# Note - Run this as an example question based on available data.
#
PaperQA_tool = [t for t in cb.tk.get_tools() if isinstance(t, PaperQAEmulationTool)][0]
# The quoted question inside the prompt was missing its closing double quote.
question = 'Write an essay to answer the question: "What is the connection between SEC61B and the unfolded protein response?"'
PaperQA_tool.run(tool_input={'question': question})
Crazy Bug
Running the query below with skc_id=83 takes 6 minutes, while the same query with skc_id=79 takes less than 1 sec.
# Fast case of the performance-bug reproduction: collection id '79'.
# Roll back the session's current transaction before running the raw SQL.
ldb.session.rollback()
# Select distinct (collection name, expression id, expression content,
# publication date, expression type, embedding, fragment content) rows.
# The five tables are joined implicitly in the WHERE clause, restricted to
# embeddings whose cmetadata i_type is 'CitationRecord' and to skc.id = '79',
# newest publication first. The fetched rows are not bound to a name.
ldb.session.execute(text('''SELECT DISTINCT skc.name, ske.id, ske.content, ske.publication_date as pub_date, ske.type as pub_type, emb.embedding, skf.content
FROM langchain_pg_embedding as emb,
"ScientificKnowledgeExpression" as ske,
"ScientificKnowledgeCollection_has_members" as skc_hm,
"ScientificKnowledgeCollection" as skc,
"ScientificKnowledgeFragment" as skf
WHERE emb.cmetadata->>'i_type' = 'CitationRecord' AND
emb.cmetadata->>'e_id' = ske.id AND
emb.cmetadata->>'f_id' = skf.id AND
skc_hm."ScientificKnowledgeCollection_id" = skc.id AND
ske.id = skc_hm.has_members_id AND (skc.id='79')
ORDER BY pub_date DESC;''')).fetchall()
# Slow case of the performance-bug reproduction: collection id '83'.
# The statement is the same five-table implicit join used for collection
# '79'; only the skc.id literal differs.
# Roll back the session's current transaction before running the raw SQL.
ldb.session.rollback()
# The fetched rows are not bound to a name.
ldb.session.execute(text('''SELECT DISTINCT skc.name, ske.id, ske.content, ske.publication_date as pub_date, ske.type as pub_type, emb.embedding, skf.content
FROM langchain_pg_embedding as emb,
"ScientificKnowledgeExpression" as ske,
"ScientificKnowledgeCollection_has_members" as skc_hm,
"ScientificKnowledgeCollection" as skc,
"ScientificKnowledgeFragment" as skf
WHERE emb.cmetadata->>'i_type' = 'CitationRecord' AND
emb.cmetadata->>'e_id' = ske.id AND
emb.cmetadata->>'f_id' = skf.id AND
skc_hm."ScientificKnowledgeCollection_id" = skc.id AND
ske.id = skc_hm.has_members_id AND (skc.id='83')
ORDER BY pub_date DESC;''')).fetchall()
# Count the expressions (distinct on SKE.id), then pass all of them to the
# database's embedding routine.
q = ldb.session.query(SKE).distinct(SKE.id)
print(q.count())
ldb.embed_expression_list(q.all())
# Ask the agent, in natural language, to fetch full text for collection "0".
cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="0".'})
# One row per collection: id, name and number of member expressions,
# ordered by the collection id cast to an integer.
q = ldb.session.query(SKC.id, SKC.name, func.count(SKC_HM.has_members_id)) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .group_by(SKC.id, SKC.name) \
        .order_by(SKC.id.cast(Integer))
corpora_df = pd.DataFrame(q.all(), columns=['Corpus ID', 'Corpus Name', 'Paper Count'])

# Total number of expression rows, regardless of collection.
paper_count = ldb.session.query(func.count(SKE.id)).first()
print('Count of all papers in database: %d'%(paper_count[0]))
corpora_df
# Print every note of type 'NoteAboutFragment'. Each note's content is parsed
# as JSON and its 'response' and 'data' entries are printed (None if absent).
q3 = ldb.session.query(N) \
        .filter(N.type == 'NoteAboutFragment')

for n in q3.all():
    n_content = json.loads(n.content)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(n.id)
    print(n_content.get('response'))
    print(n_content.get('data'))
# Load every expression and report how many there are.
skes = ldb.session.query(SKE).all()
#ldb.embed_expression_list(skes)
print(len(skes))
# Try to retrieve full text for every paper in corpus '81', counting how many
# papers already have a full-text file on disk.
ldb.session.rollback()
# The export read `tk.get_tools()`, but no `tk` is defined in this script;
# every other cell reaches the toolkit through `cb.tk`.
ft_retriever = [t for t in cb.tk.get_tools() if isinstance(t, RetrieveFullTextTool)][0]

for i, c in corpora_df.iterrows():
    if c['Corpus ID'] != '81':
        continue
    print(c['Corpus Name'])
    ft_count = 0
    no_ft_count = 0
    doi_list = [e.id for e in ldb.list_expressions(collection_id=c['Corpus ID'])]
    # Loop-invariant. NOTE(review): `loc` and `db_name` are joined without a
    # separator, so this presumably expects `loc` to end with '/' - confirm.
    path = loc+db_name+'/ft/'
    for doi in doi_list:
        d2 = doi.replace('doi:', '')
        nxml_file_path = path+'/'+d2+'.nxml'
        pdf_file_path = path+'/'+d2+'.pdf'
        html_file_path = path+'/'+d2+'.html'
        if os.path.exists(nxml_file_path) or \
                os.path.exists(pdf_file_path) or \
                os.path.exists(html_file_path):
            ft_count += 1
        # NOTE(review): the scrambled source shows no `else`/`continue` here,
        # so retrieval is attempted (and no_ft_count incremented) for every
        # paper, including ones that already have a file - confirm intent.
        try:
            no_ft_count += 1
            #print('\t'+doi)
            ft_retriever.run(tool_input={'paper_id': doi})
        except Exception as e:
            # Best-effort: report the failure and move on to the next paper.
            print(e)
    print(ft_count)
    print(no_ft_count)
# List every 'section' fragment belonging to an item whose type ends in
# 'FullText', ordered by expression id and then fragment offset.
# The join path is collection -> expression -> item -> fragment.
# The export repeated the SKE_HR.has_representation_id==SKI.id filter; the
# redundant copy is dropped.
q = ldb.session.query(SKE.id, SKI.id, SKI.type, SKF.id, SKF.type, SKF.offset, SKF.content) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id==SKF.id) \
        .filter(SKF.type=='section') \
        .filter(SKI.type.like('%FullText')) \
        .order_by(SKE.id, SKF.offset)
items_df = pd.DataFrame(q.all(), columns=['doi', 'item_id', 'item_type', 'fragment_id', 'fragment_type', 'offset', 'content'])

items_df
Index the abstracts and run some simple semantic queries
Here we index each paper’s title and abstract to build a simple question / answer interface.
# Embed every expression of corpus '81'; all other corpora are skipped.
ldb.session.rollback()
for i, c in tqdm(corpora_df.iterrows()):
    if c['Corpus ID'] != '81':
        continue
    expressions = ldb.list_expressions(collection_id=c['Corpus ID'])
    ldb.embed_expression_list(expressions)
# Example question, run against the full-text vector index (top 10 hits).
question = 'What is known about genetics underlying Stiff Person Syndrome?'

ldb.query_vectorindex(question, k=10, collection_name='ScienceKnowledgeItem_FullText')
ATTEMPTING TO RECONSTRUCT PAPER-QA PIPELINE IN OUR SYSTEM.
- Embed paper sections + question
- Given the question, summarize the retrieved paper sections relative to the question
- Score and select relevant passages
- Put summaries into prompt
- Generate answer with prompt
# Point PGVector at the database named by `ldb.name` and open the existing
# 'ScienceKnowledgeItem' index with the database's embedding model.
os.environ['PGVECTOR_CONNECTION_STRING'] = "postgresql+psycopg2:///"+ldb.name
vectorstore = PGVector.from_existing_index(
        embedding = ldb.embed_model,
        collection_name = 'ScienceKnowledgeItem')
# Retrieve 15 documents per query, filtered on metadata skc_ids == 81.
retriever = vectorstore.as_retriever(search_kwargs={'k':15, 'filter': {'skc_ids': 81}})
#retriever = vectorstore.as_retriever()
retriever.invoke(question)
#from paperqa.prompts import summary_prompt as paperqa_summary_prompt, qa_prompt as paperqa_qa_prompt, select_paper_prompt, citation_prompt, default_system_prompt
# Human-message template for the summarisation step. Placeholders: {k} (number
# of articles), {context} (the encoded articles), {question} and
# {summary_length} (word limit per summary).
hum_p = '''First, read through the following JSON encoding of {k} research articles:
Each document has three attributes: (A) a digital object identifier ('DOI') code, (B) a CITATION string containing the authors, publication year, title and publication location, and the (C) CONTENT field with the title and abstract of the paper.
```json:{context}```
Then, generate a JSON list of summaries of each article in order to help answer the following question:
Question: {question}
Do NOT directly answer the question, instead summarize to give evidence to help answer the question.
Focus on specific details, including numbers, equations, or specific quotes.
Reply "Not applicable" if text is irrelevant.
Restrict each summary to {summary_length} words.
Also, provide a score from 1-10 indicating relevance to question. Do not explain your score.
Write this answer as JSON formatted output. Provide a list of {k} dict objects with the following fields: DOI, SUMMARY, RELEVANCE SCORE.
Do not provide additional explanation for the answer.
Do not include any other response other than a JSON object.
'''
# System-message text for the summarisation prompt.
sys_p = '''Answer in a direct and concise tone. Your audience is an expert, so be highly specific. If there are ambiguous terms or acronyms, first define them.'''
# Per-document template: renders the `ske_id` and `citation` metadata and the
# page content of one retrieved document.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="'DOI': '{ske_id}', CITATION: '{citation}', CONTENT:'{page_content}'")


def combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="},{\n"
):
    """Render `docs` into a single bracketed, brace-delimited string.

    Each document is formatted with `document_prompt`, the results are joined
    with `document_separator`, and the whole is wrapped in '[{' ... '}]'.

    NOTE(review): the output only looks like JSON. The default template emits
    single-quoted and unquoted keys and does no escaping, so it is not
    guaranteed to parse as JSON.

    Args:
        docs: iterable of documents accepted by `format_document`.
        document_prompt: prompt template applied to every document.
        document_separator: string placed between rendered documents.

    Returns:
        The combined string; '[{}]' when `docs` is empty.
    """
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return '[{'+document_separator.join(doc_strings)+'}]'
# Chat prompt: system text followed by the human summarisation template.
template = ChatPromptTemplate.from_messages([
    ("system", sys_p),
    ("human", hum_p)])

# Step 1 passes k / question / summary_length through unchanged and builds
# "context" by running the question through the retriever and combining the
# hits. Step 2 returns the model's parsed summary together with that context.
qa_chain = (
    RunnableParallel({
        "k": itemgetter("k"),
        "question": itemgetter("question"),
        "summary_length": itemgetter("summary_length"),
        "context": itemgetter("question") | retriever | combine_documents,
    })
    | {
        "summary": template | ChatOllama(model='mixtral') | JsonEnclosedByTextOutputParser(),
        "context": itemgetter("context"),
    }
)
# Run the chain once with console tracing and pretty-print the result.
# The input dict was named `input`, which shadowed the builtin.
qa_input = {'question': question, 'summary_length': 1000, 'k':5}
out = qa_chain.invoke(qa_input, config={'callbacks': [ConsoleCallbackHandler()]})
print(json.dumps(out, indent=4))
Discourse Analysis
# Discourse tagger: a text-classification pipeline loaded from a local model
# directory, using the BioBERT tokenizer.
# NOTE(review): `model_path` is a hard-coded, machine-specific path.
model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
# Maps the pipeline's generic 'LABEL_<i>' outputs onto discourse labels.
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
# NOTE: `model` is loaded and put in eval mode here, but the pipeline below is
# given `model_path`, not this object.
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model = model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
# Try an out-of-the-box classifier on the data for discourse tagging.
# Build a table with one row per sentence of every fragment that belongs to a
# 'CitationRecord' item.
ldb.session.rollback()
# Only referenced by the commented-out date filter below.
one_year_ago = (datetime.now() - timedelta(days=1*365))

q = ldb.session.query(SKE, SKF) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id==SKF.id) \
        .filter(SKI.type == 'CitationRecord' ) \
        .order_by(SKE.id)

#        .filter(SKC.name == 'The Stiff Person Syndrome' ) \
#        .filter(SKE.publication_date >= one_year_ago) \
# Row layout: expression id, fragment id, sentence index, sentence text.
s_list = []
for e, f in q.all():
    for i, s in enumerate(ldb.sent_detector.tokenize(f.content)):
        s_list.append([e.id, f.id, i, s])
sent_df = pd.DataFrame(s_list, columns=['doi', 'f_id', 's_id', 'text'])
sent_df
# Predict multiple texts in one classifier call and time the inference.
start = time()

# Alias, not a copy: the columns added below also appear on `sent_df`.
df = sent_df

preds = classifier(df['text'].tolist())
pred_df = pd.DataFrame(preds)
# Read labels and scores straight from the prediction dicts (one per input
# text, in order) instead of iterating DataFrame rows.
df['label'] = [lookup[p['label']] for p in preds]
df['score'] = [p['score'] for p in preds]

end = time()

print('Prediction time:', str(timedelta(seconds=end-start)))
df
ldb.session.rollback()
# Generate fragment sentences and add them as Notes
# For each classified sentence: create a 'sentence' fragment under its parent
# item, then attach a JSON note holding the discourse label and score.
ldb.session.rollback()
for i, row in df.iterrows():
    f_q = ldb.session.query(SKF).filter(SKF.id == row.f_id).first()
    # NOTE(review): the item id is taken as the text before the first '.' of
    # the fragment id; confirm this holds for ids that contain dots themselves.
    i_q = ldb.session.query(SKI).filter(SKI.id == row.f_id.split('.')[0]).first()
    # str.find returns -1 when the sentence is not found in the item content.
    sent_offset = i_q.content.find(row.text)
    sent_length = len(row.text)
    sentence_fragment = ScientificKnowledgeFragment(id=f_q.id+'.'+str(row.s_id),
                                                    content=row.text,
                                                    offset=sent_offset,
                                                    length=sent_length,
                                                    type='sentence')
    i_q.has_part.append(sentence_fragment)
    note_content = {'discourse_label': row.label, 'score': row.score}
    n = Note(id=f_q.id+'.'+str(row.s_id)+'.discourse_type',
             content=json.dumps(note_content, indent=4),
             format='json',
             type='NoteAboutFragment')
    sentence_fragment.has_notes.append(n)
    ldb.session.flush()
ldb.session.commit()
Running DRSM Classifiers.
# DRSM classifier: a text-classification pipeline loaded from a local model
# directory, using the BioBERT tokenizer.
# NOTE(review): `model_path` is a hard-coded, machine-specific path.
model_path = '/Users/gully.burns/Documents/2024H1/models/drsm_classifier'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
# NOTE(review): this is the same label list used for the discourse tagger;
# confirm it is correct for the DRSM model.
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
# NOTE: `model` is loaded and put in eval mode here, but the pipeline below is
# given `model_path`, not this object.
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model = model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
Topic Modeling over the corpus.
What are the main topics being discussed in each paper?
# NOTE(review): this cell sits under the topic-modeling heading but sets up
# the DRSM classifier pipeline again; it looks like a placeholder - confirm.
# NOTE(review): `model_path` is a hard-coded, machine-specific path.
model_path = '/Users/gully.burns/Documents/2024H1/models/drsm_classifier'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
# Maps the pipeline's generic 'LABEL_<i>' outputs onto named labels.
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
# NOTE: `model` is loaded and put in eval mode here, but the pipeline below is
# given `model_path`, not this object.
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model = model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
Search for and download Full Text Papers.
Can we search for all Stiff Person Syndrome papers published in the last 10 years?
# Count papers per publication year over roughly the last ten years for the
# collection named 'The Stiff Person Syndrome'.
ldb.session.rollback()
ten_years_ago = (datetime.now() - timedelta(days=10*365))
print(ten_years_ago)
q = ldb.session.query(func.extract('year', SKE.publication_date.cast(Date)), func.count(SKE.id) ) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.publication_date >= ten_years_ago) \
        .filter(SKC.name == 'The Stiff Person Syndrome' ) \
        .group_by(func.extract('year', SKE.publication_date.cast(Date))) \
        .order_by(func.extract('year', SKE.publication_date.cast(Date)))
# The query yields (year, count) pairs; the columns used to be labelled
# ['doi', 'date'], which matched neither value.
sps_pubcount_df = pd.DataFrame(q.all(), columns=['year', 'paper_count'])
sps_pubcount_df
Run PaperQA
# Ask the agent for a short essay grounded in the collection with ID "70".
cb.agent_executor.invoke({'input':'Write a short essay on "What connections between primary ciliary diskinesia and primary cilia have been studied?" based on the collection with ID="70".'})