from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from rapidfuzz import fuzz
# Plot the distribution of the lengths of the methods sections
import seaborn as sns
import matplotlib.pyplot as plt
import tiktoken
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, pipeline
import local_resources.queries.em_tech as em_tech_queries
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
import json
from jsonpath_ng import jsonpath, parse
from langchain_community.chat_models.openai import ChatOpenAI
CryoET
Introduction to CryoET
Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.
This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute (CZII) in developing the CryoET Data Portal, an open-source repository for CryoET-based data.
Basics
Python Imports
Set Python imports, environment variables, and other crucial setup parameters here.
Environment Variables
Remember to set environment variables for this code:
ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
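As a minimal sketch, these can be set from Python before the rest of the notebook runs (the values below are placeholders; substitute your own database name and a writable directory):
import os
# Placeholder values - replace with your own database name and scratch directory.
os.environ['ALHAZEN_DB_NAME'] = 'em_tech'
os.environ['LOCAL_FILE_PATH'] = '/path/to/alhazen/workspace/'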
if os.environ.get('LOCAL_FILE_PATH') is None:
raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
os.makedirs(os.environ['LOCAL_FILE_PATH'])
loc = os.environ['LOCAL_FILE_PATH']
db_name = 'em_tech'
Setup utils, agents, and tools
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())
llm_gpt4_1106 = ChatOpenAI(model='gpt-4-1106-preview')
llm_gpt35 = ChatOpenAI(model='gpt-3.5-turbo')
#llm = llms_lookup.get('databricks_llama3')
cb = AlhazenAgent(llm_gpt35, llm_gpt35, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
print('\t'+type(t).__name__)
test_tk = MetadataExtractionToolkit(db=ldb, llm=llm_gpt35)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
print('\t'+type(t).__name__)
Set Evaluation Dataset
These are cases taken directly from the *.yaml files that describe datasets in the CZI CryoET Data Portal.
Identify cases from the CZI CryoET Portal.
dois = {10000: ['10.1101/2022.04.12.488077'],
10001: ['10.1101/2022.04.12.488077'],
10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'],
10004: ['10.1101/2023.04.28.538734'],
10005: ['10.1038/s41594-022-00861-0'],
10006: ['10.1038/s41586-020-2665-2'],
10007: [],
10008: ['10.1038/s41586-022-04971-z'],
10009: ['10.1126/science.abm6704'],
10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}
dois_flattened = [doi for doi_list in dois.values() for doi in doi_list]
dois_flattened = list(set(dois_flattened))
dois_flattened
Retrieve Gold Standard experimental metadata from EMPIAR database.
- Download the entire database to a local file from: https://www.ebi.ac.uk/emdb/search/database:EMPIAR
- Save the location in a temporary variable: empiar_metadata_path
- Process the downloaded file for (A) EMDB ids, (B) DOI values for publications.
# local_path to the file downloaded from the EMPIAR search results:
# https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true
# download the file and save it to a local path
url = "https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true"
empiar_metadata_path = loc+db_name+'/EMPIAR_search_results.json'
response = requests.get(url, stream=True)
with open(empiar_metadata_path, "wb") as handle:
for data in response.iter_content():
handle.write(data)
with open(empiar_metadata_path, 'r') as f:
empiar_metadata = json.load(f)
empiar_dataset_ids = list(empiar_metadata.keys())
d = {}
for empiar_id in empiar_dataset_ids:
d[empiar_id] = {'dois':[], 'emd_ids': []}
for citation in empiar_metadata.get(empiar_id, {}).get('citation', []):
if citation.get('doi') is not None:
d[empiar_id]['dois'].append(citation.get('doi'))
for emd_id in empiar_metadata.get(empiar_id, {}).get('cross_references', []):
d[empiar_id]['emd_ids'].append(emd_id.get('name'))
def get_nested(data, *args):
if args and data:
element = args[0]
if element:
value = data.get(element)
return value if len(args) == 1 else get_nested(value, *args[1:])
# get metadata from the EMDB entries for each case
metadlist = []
# jsonpath expressions to identify specific metadata from the EMDB entries
# focus mainly on the specimen preparation (grids, buffers, vitrification, etc.)
sd_jp = 'structure_determination_list.structure_determination[*]'
sample_preparation_type_jp = parse(sd_jp + '.method')
agg_state_jp = parse(sd_jp + '.aggregation_state')
specprep_list_jp = sd_jp + '.specimen_preparation_list.specimen_preparation[*]'
buffer_jp = parse(specprep_list_jp + '.buffer.ph')
grid_model_jp = parse(specprep_list_jp + '.grid.model')
grid_material_jp = parse(specprep_list_jp + '.grid.material')
grid_mesh_jp = parse(specprep_list_jp + '.grid.mesh')
grid_support_topology_jp = parse(specprep_list_jp + '.grid.support_film[*].film_topology')
grid_pretreatment_jp = parse(specprep_list_jp + '.grid.pretreatment.type_')
grid_vitrification_cryogen_jp = parse(specprep_list_jp + '.vitrification.cryogen_name')
grid_vit_ctemp_jp = specprep_list_jp + '.vitrification.chamber_temperature'
grid_vit_chumid_jp = specprep_list_jp + '.vitrification.chamber_humidity'
jp_method = parse('structure_determination_list.structure_determination[*]')
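# A minimal illustration (toy record, not a real EMDB response) of how jsonpath_ng is used below:
# parse() compiles a path expression, .find() returns matches, and m.value holds each matched value.
_toy = {'structure_determination_list': {'structure_determination': [{'method': 'tomography'}]}}
print([m.value for m in sample_preparation_type_jp.find(_toy)])  # expected: ['tomography']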
i = 0
for k,v in d.items():
#i += 1
#if i > 10:
# break
print(k,v)
for emd_id in v['emd_ids']:
emd_exp = requests.get('https://www.ebi.ac.uk/emdb/api/entry/experiment/'+emd_id)
if emd_exp.status_code == 200:
emd = emd_exp.json()
sample_preparation_type = ', '.join([m.value for m in sample_preparation_type_jp.find(emd)])
agg_state = ', '.join([m.value for m in agg_state_jp.find(emd)])
buffer = ', '.join([str(m.value) for m in buffer_jp.find(emd)])
grid_model = ', '.join([m.value for m in grid_model_jp.find(emd)])
grid_material = ', '.join([m.value for m in grid_material_jp.find(emd)])
grid_mesh = ', '.join([str(m.value) for m in grid_mesh_jp.find(emd)])
grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])
grid_pretreatment = ', '.join([m.value for m in grid_pretreatment_jp.find(emd)])
grid_vitrification_cryogen = ', '.join([m.value for m in grid_vitrification_cryogen_jp.find(emd)])
grid_vit_ctemp_units = [m.value for m in parse(grid_vit_ctemp_jp+'.units').find(emd)]
grid_vit_ctemp_values = [str(m.value) for m in parse(grid_vit_ctemp_jp+'.valueOf_').find(emd)]
grid_vit_ctemp = ','.join([t[0]+' '+t[1] for t in zip(grid_vit_ctemp_values, grid_vit_ctemp_units)])
grid_vit_chumid_units = [m.value for m in parse(grid_vit_chumid_jp+'.units').find(emd)]
grid_vit_chumid_values = [str(m.value) for m in parse(grid_vit_chumid_jp+'.valueOf_').find(emd)]
grid_vit_chumid = ', '.join([t[0]+' '+t[1] for t in zip(grid_vit_chumid_values, grid_vit_chumid_units)])
for doi in v['dois']:
metadlist.append({'doi':doi,
'emd_id': emd_id,
'sample_preparation_type': sample_preparation_type,
'agg_state': agg_state,
'sample_preparation_buffer_ph': buffer,
'grid_model': grid_model,
'grid_material': grid_material,
'grid_mesh': grid_mesh,
'grid_support_topology': grid_support_topology,
'grid_pretreatment': grid_pretreatment,
'grid_vitrification_cryogen': grid_vitrification_cryogen,
'grid_vit_ctemp': grid_vit_ctemp,
'grid_vit_chumid': grid_vit_chumid})
else:
print('ERROR: ', emd_exp.status_code)
empiar_df = pd.DataFrame(metadlist)
empiar_df.to_csv(loc+db_name+'/empiar_metadata.tsv', sep='\t', index=False)
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df
Load the EMPIAR data from disk
This is loaded from the local directory that we just created.
empiar_df = pd.read_csv(loc+db_name+'/empiar/empiar_metadata.tsv', sep='\t')
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df
Building the database
Scripts to Build / Delete the database
If you need to restore a deleted database from backup, use the following shell commands:
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
loc = os.environ['LOCAL_FILE_PATH']
current_date_time = datetime.now()
formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
backup_ceifns_database(db_name, backup_path)
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
Build CEIFNS database from queries
Add a collection based on EMPIAR papers
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for start_i in range(0, len(empiar_dois), step):
query = ' OR '.join(['doi:"'+d+'"' for d in empiar_dois[start_i:start_i+step]])
addEMPCCollection_tool.run({'id': '3', 'name':'EMPIAR Papers', 'query':query, 'full_text':True})
def join_set(x):
out = ''
try:
out = ' '.join(set(x))
except:
pass
return out
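# e.g., join_set(['a', 'a', 'b']) returns 'a b' (deduplicates before joining; order is not guaranteed)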
# identify papers that we have full text for in EMPIAR
q = ldb.session.query(SKE.id) \
.distinct() \
.filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
.filter(SKC_HM.has_members_id==SKE.id) \
.filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id==SKI.id) \
.filter(SKC.id == '3') \
.filter(or_(SKI.type == 'JATSFullText', SKI.type == 'PDFFullText'))
dois_to_include = [d[0][4:] for d in q.all()]
empiar_gold_standard = []
for i, row in empiar_df.iterrows():
if row.doi in dois_to_include:
empiar_gold_standard.append( row.to_dict() )
empiar_gold_standard_df = pd.DataFrame(empiar_gold_standard)
empiar_gs_df = empiar_gold_standard_df.groupby(['doi']).agg({'sample_preparation_type': join_set,
'agg_state': join_set,
'sample_preparation_buffer_ph': join_set,
'grid_model': join_set,
'grid_material': join_set,
'grid_mesh': join_set,
'grid_support_topology': join_set,
'grid_pretreatment': join_set,
'grid_vitrification_cryogen': join_set,
'grid_vit_ctemp': join_set,
'grid_vit_chumid': join_set}).reset_index()
empiar_gs_df
Import papers from DOIs pertaining to CryoET-Portal records 10000-10010
The CryoET Data Portal system is based on data submitted to our curation team, accompanied by papers referenced by DOIs. Each dataset is assigned an ID value that is associated with one or more DOIs.
# use the EMPCSearchTool to run a query for the dois mentioned
query = ' OR '.join(['doi:"'+d+'"' for d_id in dois for d in dois[d_id] ])
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '0', 'name':'CryoET Portal (10000-10010)', 'query':query, 'full_text':True})
Extend Database to include all CryoET papers
cols_to_include = ['ID', 'CORPUS_NAME', 'QUERY']
df = pd.read_csv(files(em_tech_queries).joinpath('EM_Methods.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])
df
qt = QueryTranslator(df.sort_values('ID'), 'ID', 'QUERY', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS', 'METHODS'])
corpus_names = df['CORPUS_NAME']
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
if id != 2:
continue
addEMPCCollection_tool.run(tool_input={'id': id, 'name':name, 'query':query, 'full_text':False})
Combine + Sample CryoET + EMPIAR Collections to provide a test set of papers.
ldb.create_new_collection_from_intersection('4', 'EMPIAR CryoET Papers', '2', '3')
Adding Machine Learning
ml_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
"Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" or
"cryo-electron tomography" OR "cryo-et" OR cryoet ) AND
("Machine Learning" OR "Artificial Intelligence" OR "Deep Learning" OR "Neural Networks")
'''
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '6',
'name': 'Machine Learning in CryoET',
'query': ml_query,
'full_text': False})
delCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
delCollection_tool.run(tool_input={'collection_id': '6'})
Break up TIAB of papers into sentences + classify by discourse
NOTE: Hugging Face models do not work well on this corpus (not surprisingly, since they were trained on medical papers where the different sections of the paper were explicitly labeled).
Use LLMs (e.g., GPT-3.5) to do the extraction instead.
# Get the metadata extraction tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
t2.run(tool_input={'collection_id': '5', 'run_label': 'dev'})
j = '''{
"Background": "Eps15-homology domain containing proteins (EHDs) are eukaryotic, dynamin-related ATPases involved in cellular membrane trafficking. They oligomerize on membranes into filaments that induce membrane tubulation. While EHD crystal structures in open and closed conformations were previously reported, little structural information is available for the membrane-bound oligomeric form. Consequently, mechanistic insights into the membrane remodeling mechanism have remained sparse.",
"Objectives_Methods": "Here, by using cryo-electron tomography and subtomogram averaging, we determined structures of nucleotide-bound EHD4 filaments on membrane tubes of various diameters at an average resolution of 7.6 Å.",
"Results_Conclusions": "Assembly of EHD4 is mediated via interfaces in the G-domain and the helical domain. The oligomerized EHD4 structure resembles the closed conformation, where the tips of the helical domains protrude into the membrane. The variation in filament geometry and tube radius suggests a spontaneous filament curvature of approximately 1/70 nm<sup>-1</sup>. Combining the available structural and functional data, we suggest a model for EHD-mediated membrane remodeling."
}'''
json.loads(j)
# Get the metadata extraction tool
models = ['databricks_dbrx']
for m in models:
llm = llms_lookup.get(m)
cb = AlhazenAgent(llm, llm, db_name=db_name)
t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
t2.run(tool_input={'collection_id': '2', 'run_label': m, 'repeat_run': False})
to_remove = ["doi:10.1101/2024.03.04.583254", "doi:10.1101/2023.11.21.567712",
"doi:10.3791/6515", "doi:10.1101/2023.07.28.550950",
"doi:10.1093/micmic/ozad067.483", "doi:10.1007/978-1-0716-2639-9_20",
"doi:10.1016/j.yjsbx.2022.100076", "doi:10.1016/j.xpro.2022.101658",
"doi:10.1016/j.cell.2022.06.034", "doi:10.1093/plphys/kiab449",
"doi:10.1073/pnas.2118020118", "doi:10.3791/62886",
"doi:10.20944/preprints202105.0098.v1", "doi:10.1016/bs.mcb.2020.12.009",
"doi:10.1007/978-1-0716-0966-8_1", "doi:10.1007/978-1-0716-0966-8_2",
"doi:10.21769/bioprotoc.3768", "doi:10.1371/journal.ppat.1008883",
"doi:10.1101/2020.05.19.104828", "doi:10.1073/pnas.1916331116",
"doi:10.1042/bst20170351_cor", "doi:10.1038/s41594-018-0043-7",
"doi:10.1007/978-1-4939-8585-2_4", "doi:10.1007/s41048-017-0040-0",
"doi:10.1007/978-1-4939-6927-2_20", "doi:10.1016/j.str.2015.03.008",
"doi:10.1007/978-1-62703-227-8_4", "doi:10.1016/b978-0-12-397945-2.00017-2",
"doi:10.1016/j.jmb.2010.10.021", "doi:10.1186/1757-5036-3-6",
"doi:10.1016/j.jmb.2008.03.014", "doi:10.1007/978-1-59745-294-6_20"]
for d in to_remove:
q = """
SELECT DISTINCT n.id FROM langchain_pg_embedding as emb, "Note" as n
WHERE emb.cmetadata->>'n_type' = 'TiAbMappingNote__Discourse' AND
emb.cmetadata->>'about_id' = '{}' AND
emb.cmetadata->>'discourse_type' = 'ResultsConclusions' AND
emb.cmetadata->>'n_id' = n.id;""".format(d)
for row in ldb.session.execute(text(q)).all():
ldb.delete_note(row[0], commit_this=True)
ldb.session.rollback()
exp_q = ldb.session.query(SKE) \
.filter(SKC_HM.has_members_id == SKE.id) \
.filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
.filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id == SKI.id) \
.filter(SKI.type=='CitationRecord') \
.filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
.order_by(desc(SKE.publication_date))
count = 0
for e in tqdm(exp_q.all()):
q = ldb.session.query(N) \
.filter(N.id == NIA.Note_id) \
.filter(NIA.is_about_id == e.id) \
.filter(N.type =='TiAbMappingNote__Discourse')
for n in q.all():
dmap = json.loads(n.content)
if 'Objectives_Methods' in dmap.keys():
print('beep')
#new_dmap = {'Background': dmap.get('Background'), 'ObjectivesMethods': dmap.get('Objectives_Methods'), 'ResultsConclusions': dmap.get('Results_Conclusions')}
#n.content = json.dumps(new_dmap, indent=4)
#ldb.session.flush()
#ldb.session.commit()
ldb.session.rollback()
exp_q = ldb.session.query(SKE) \
.filter(SKC_HM.has_members_id == SKE.id) \
.filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
.filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id == SKI.id) \
.filter(SKI.type=='CitationRecord') \
.filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
.order_by(desc(SKE.publication_date))
texts = []
metadatas = []
count = 0
for e in tqdm(exp_q.all()):
q = ldb.session.query(N) \
.filter(N.id == NIA.Note_id) \
.filter(NIA.is_about_id == e.id) \
.filter(N.type =='TiAbMappingNote__Discourse')
n = q.first()
dmap = json.loads(n.content)
'''Runs through the list of expressions, generates embeddings, and stores them in the database'''
for dtype in ['Background', 'ObjectivesMethods', 'ResultsConclusions']:
t = dmap.get(dtype)
if t is None:
continue
texts.append(t)
metadatas.append({'about_id': e.id, \
'about_type': 'ScientificKnowledgeExpression', \
'n_id': n.id, \
'n_type': 'TiAbMappingNote__Discourse', \
'discourse_type': dtype})
docs = []
for t,m in zip(texts, metadatas):
docs.append(Document(page_content=t, metadata=m))
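# Store the discourse-segment texts and their metadata as embeddings in the 'Note' pgvector collection.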
db = PGVector.from_documents(
embedding=ldb.embed_model,
documents=docs,
collection_name="Note"
)
model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
truncation=True,
max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()
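# NOTE: device='mps' in the pipeline below assumes an Apple Silicon GPU; use 'cuda' or 'cpu' on other hardware.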
classifier = pipeline("text-classification",
model = model_path,
tokenizer=tokenizer,
truncation=True,
batch_size=8,
device='mps')
self = ldb
collection_id = '2'
q1 = self.session.query(SKE, SKI) \
.filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
.filter(SKC_HM.has_members_id == SKE.id) \
.filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id == SKI.id) \
.filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
.filter(SKI_HP.has_part_id == SKF.id) \
.filter(SKI.type == 'CitationRecord') \
.filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint'))
for ske, ski in tqdm(q1.all()):
b = ''
om = ''
rc = ''
fragments = []
for f in ski.has_part:
if f.type in ['title', 'abstract']:
fragments.append(f)
# USE AN LLM HERE INSTEAD OF A DEEP LEARNING CLASSIFER
for skf in sorted(fragments, key=lambda f: f.offset):
for s in self.sent_detector.tokenize(skf.content):
m = classifier(s)
l = lookup.get(m[0].get('label'))
if l == 'BACKGROUND':
if len(b) > 0:
b += '\n'
b += s
elif l == 'OBJECTIVE' or l == 'METHODS':
if len(om) > 0:
om += '\n'
om += s
else:
if len(rc) > 0:
rc += '\n'
rc += s
skf_stem = ske.id+'.'+ski.type+'.'
if len(b) > 0:
f_b = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
type='background_sentences', offset=-1, length=len(b),
name=skf_stem+'background', content=b)
self.session.add(f_b)
ski.has_part.append(f_b)
f_b.part_of = ski.id
if len(om) > 0:
f_om = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
type='objective_methods_sentences', offset=-1, length=len(om),
name=skf_stem+'objective_methods', content=om)
self.session.add(f_om)
ski.has_part.append(f_om)
f_om.part_of = ski.id
if len(rc) > 0:
f_rc = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
type='results_conclusions_sentences', offset=-1, length=len(rc),
name=skf_stem+'results_conclusions', content=rc)
self.session.add(f_rc)
ski.has_part.append(f_rc)
f_rc.part_of = ski.id
self.session.flush()
self.session.commit()
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKF) \
.filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
.filter(SKC_HM.has_members_id == SKE.id) \
.filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id == SKI.id) \
.filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
.filter(SKI_HP.has_part_id == SKF.id) \
.filter(SKI.type == 'CitationRecord') \
.filter(or_(SKF.type == 'results_conclusions_sentences', \
SKF.type == 'objective_methods_sentences', \
SKF.type == 'background_sentences'))
for skf in tqdm(q2.all()):
self.delete_fragment(skf.id)
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKE, SKF) \
.filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
.filter(SKC_HM.has_members_id == SKE.id) \
.filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id == SKI.id) \
.filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
.filter(SKI_HP.has_part_id == SKF.id) \
.filter(SKI.type == 'CitationRecord') \
.filter(SKF.type == 'objective_methods_sentences') \
.order_by(desc(SKE.publication_date)) \
.order_by(SKF.name)
for ske, skf in tqdm(q2.all()):
print(skf)
Get full text copies of all the papers about CryoET
cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="2".'})
ldb.create_new_collection_from_sample('5', 'EMPIAR CryoET Papers Tests', '4', 20, ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])
Analyze Collections
q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
.filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
.filter(SKC_HM.has_members_id==SKE.id) \
.filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
.filter(SKE_HR.has_representation_id==SKI.id)
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique())).fillna(0)
Survey + Run Classifications over Papers
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
l = []
q = ldb.session.query(N, SKE) \
.filter(N.id == NIA.Note_id) \
.filter(NIA.is_about_id == SKE.id) \
.filter(N.type == 'TiAbClassificationNote__cryoet_study_types')
output = []
print(len(q.all()))
for n, ske in q.all():
ldb.delete_note(n.id)
print(len(q.all()))
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'classification_type':'cryoet_study_types', 'repeat_run':True})
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '2', 'classification_type':'cryoet_study_types'})
l = []
ldb.session.rollback()
q = ldb.session.query(N, SKE) \
.join(NIA, NIA.Note_id == N.id) \
.join(SKE, SKE.id == NIA.is_about_id) \
.join(SKC_HM, SKE.id == SKC_HM.has_members_id) \
.filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
.filter(SKC_HM.ScientificKnowledgeCollection_id == '5') \
.order_by(SKE.id, N.provenance)
output = []
for n, ske in q.all():
tup = json.loads(n.content)
tup['doi'] = 'http://doi.org/'+re.sub('doi:', '', ske.id)
tup['year'] = ske.publication_date.year
tup['month'] = ske.publication_date.month
tup['ref'] = ske.content
output.append(tup)
df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df
study_type_lookup = {'A': 'Viral Pathogens',
'B': "Mutated protein structure",
'C': 'Bacterial pathogens',
'D': 'Plant cells',
'E': 'Material science',
'F': 'Intracellular Transport Structure',
'G': 'Synapses or Vesicle Release',
'H': 'Other Intracellular Structure',
'I': 'Cellular Processes',
'J': 'Dynamics of molecular interactions',
'K': 'New CryoET imaging methods',
'L': 'New data analysis methods'}
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for k in study_type_lookup.keys():
df1 = df[df['cryoet_study_type_code'] == k]
dois_to_add = [re.sub('http://doi.org/', 'doi:', r.doi) for i, r in df1.iterrows()]
c_id = '2.'+k
c_name = 'CryoET - ' + study_type_lookup[k]
corpus = None
all_existing_query = ldb.session.query(SKC).filter(SKC.id==c_id)
for c in all_existing_query.all():
corpus = c
if corpus is None:
corpus = ScientificKnowledgeCollection(id=c_id,
type='skem:ScientificKnowledgeCollection',
name=c_name,
has_members=[])
ldb.session.add(corpus)
ldb.session.flush()
for doi in tqdm(dois_to_add):
p = ldb.session.query(SKE) \
.filter(SKE.id==doi).first()
if p is None:
continue
ldb.session.add(p)
corpus.has_members.append(p)
p.member_of.append(corpus)
ldb.session.flush()
ldb.session.commit()
delete_collection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
for k in study_type_lookup.keys():
print(k)
delete_collection_tool.run({'collection_id': '2.'+k})
Survey + Run Extractions over Papers
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractExtraction_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'extraction_type':'cryoet'})
Tests + Checks
Agent tool selection + execution + interpretation
cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})
Run MetaData Extraction Chain over listed papers
Here, we run various versions of the metadata extraction tool to examine performance over the cryoet dataset.
q = ldb.session.query(SKE.id) \
.filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
.filter(SKC_HM.has_members_id==SKE.id) \
.filter(SKC.id=='5')
dois = [e.id for e in q.all()]
dois
# need to count tokens submitted to the server as a way of tracking usage.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device='mps', token=os.environ['HF_API_KEY'])
prompt = "The methods section of the paper is as follows:"
tokenized = tokenizer(prompt, return_tensors="pt")
print(len(tokenized["input_ids"][0]))
# How long are methods sections in the CryoET papers?
ldb.session.rollback()
q = ldb.session.query(SKE.id) \
.filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
.filter(SKC_HM.has_members_id==SKE.id) \
.filter(SKC.id=='2') \
.filter(or_(SKE.type=='ScientificPrimaryResearchArticle', SKE.type=='ScientificPrimaryResearchPreprint'))
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
tups = []
for e in tqdm(q.all()):
item_types = set()
item_type = None
for i in ldb.list_items_for_expression(e.id):
item_types.add(i.type)
for i_type in item_types:
if i_type == 'CitationRecord':
continue
item_type = i_type
break
if item_type is None:
continue
fragments = [f.content for f in ldb.list_fragments_for_paper(e.id, item_type, fragment_types=['section'])]
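# The loop below uses a simple heading heuristic: start accumulating section text when a
# section's first line contains 'method', and stop again at the next recognized non-methods heading.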
on_off = False
text = ''
all_text = ''
for t in fragments:
all_text += t
l1 = t.split('\n')[0].lower()
if 'method' in l1:
on_off = True
elif 'results' in l1 or 'discussion' in l1 or 'conclusion' in l1 or 'acknowledgements' in l1 \
or 'references' in l1 or 'supplementary' in l1 or 'appendix' in l1 or 'introduction' in l1 or 'abstract' in l1 or 'cited' in l1:
on_off = False
if on_off:
if len(text) > 0:
text += '\n\n'
text += t
all_text_length = len(tokenizer(all_text, return_tensors="pt")['input_ids'][0])
text_length = len(tokenizer(text, return_tensors="pt")['input_ids'][0])
tups.append({'doi':e.id, 'doc_length': all_text_length, 'method_length': text_length})
df_length = pd.DataFrame(tups)
df_length
print(len(df_length[df_length['method_length']>8000]))
print(len(df_length[df_length['method_length']<8000]))
def plot_length_distribution(df_length):
plt.hist(df_length, bins=10)
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.title('Distribution of Lengths')
plt.show()
plot_length_distribution(df_length['method_length'])
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
for i, r in tqdm(df_length.iterrows()):
if len(df[df['doi']==r['doi']]) > 0:
continue
# Run the metadata extraction tool on the doi
try:
t2.run(tool_input={'paper_id': r['doi'], 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})
except Exception as e:
print(e)
continue
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df2 = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
item_types = set()
l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test')
if(len(l) == 0):
continue
df2 = pd.concat([df2, pd.DataFrame(l)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
if r['method_length'] < 8000:
item_types = set()
l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test_llama3')
if(len(l) == 0):
continue
df = pd.concat([df, pd.DataFrame(l)])
df
df[df['doi']=='doi:10.1101/2022.04.12.488077']
df2[df2['doi']=='doi:10.1101/2022.04.12.488077']
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'
t2.compile_answers('cryoet', metadata_dir)
t2.write_answers_as_notes('cryoet', metadata_dir)
#sorted(list(set([doi for q in t2.examples for doi in t2.examples[q]])))
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
# Hack to get the path to the metadata directory as a string
#metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'
# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for d in [d for d in dois]:
item_types = set()
l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
df = pd.concat([df, pd.DataFrame(l)])
# Iterate over papers to run the metadata extraction tool
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
for d in [d for d in dois]:
item_types = set()
# Skip if the doi is already in the database
if len(df)>0 and d in df.doi.unique():
continue
# Run the metadata extraction tool on the doi
t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})
# Add the results to the dataframe
l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
df = pd.concat([df, pd.DataFrame(l2)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
df_final = pd.DataFrame()
for d in [d for d in dois]:
item_types = set()
l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
df_final = pd.concat([df_final, pd.DataFrame(l)])
df_final
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
l = []
for d in [d for d in dois]:
item_types = set()
pred1 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
pred2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test_dbrx')
gold = t2.read_metadata_extraction_notes(d, 'cryoet', 'gold')
if pred1 is None or pred2 is None or gold is None or \
len(pred1)==0 or len(pred2)==0 or len(gold)!=1:
continue
for k in gold[0]:
g_case = gold[0][k]
if g_case=='' or g_case is None:
continue
for j, p_case in enumerate(pred1):
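# fuzz.ratio returns a similarity score in [0, 100]; dividing by 100 normalizes it to [0, 1]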
sim = fuzz.ratio(str(g_case), str(p_case.get(k,''))) / 100.0
print(k, str(g_case), str(p_case.get(k,'')), sim)
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
df = t2.report_metadata_extraction_for_collection('5', 'cryoet', 'test').set_index('doi')
df.to_csv(loc+'/'+db_name+'/reports/cryoet_metadata_gpt4.tsv', sep='\t')
ldb.create_zip_archive_of_full_text_files('5', loc+'/'+db_name+'/full_text_files.zip')
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
.filter(N.id == NIA.Note_id) \
.filter(NIA.is_about_id == SKE.id) \
.filter(N.type == 'MetadataExtractionNote')
l = []
for row in q3.all():
paper = row[0]
name = row[1]
# provenance = json.loads(row[2])
result = json.loads(row[3])
kv = {k:result[k] for k in result}
kv['DOI'] = paper
kv['run'] = name
l.append(kv)
# create a dataframe from the list of dictionaries with DOI as the index column
if len(l)>0:
df = pd.DataFrame(l).set_index(['DOI', 'run'])
else:
df = pd.DataFrame()
df
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for row in q3.all():
d_id = row[0]
e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
notes_to_delete = []
for n in ldb.read_notes_about_x(e):
notes_to_delete.append(n.id)
for n in notes_to_delete:
ldb.delete_note(n)
Protocol Modeling + Extraction
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr')
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')
llm3 = ChatOpenAI(model='gpt-3.5-turbo')
d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")t1 = ProcotolEntitiesExtractionTool(db=ldb, llm=llm3, description=d)
entities = t1.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
entities
t2 = ProcotolProcessesExtractionTool(db=ldb, llm=llm3, description=d)
processes = t2.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
processes.get('data')