from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.core import lookup_chat_models
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from rapidfuzz import fuzz
# Plot the distribution of the lengths of the methods sections
import seaborn as sns
import matplotlib.pyplot as plt
import tiktoken
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, AutoModel, AutoTokenizer
import torch
import local_resources.queries.em_tech as em_tech_queries
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
import json
from jsonpath_ng import jsonpath, parse
from langchain_community.chat_models.openai import ChatOpenAI
CryoET
Introduction to CryoET
Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.
This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute (CZII) on developing the CryoET Data Portal, an open-source repository for CryoET-based data.
Basics
Python Imports
Set Python imports, environment variables, and other crucial setup parameters here.
Environment Variables
Remember to set environment variables for this code:
- ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
- LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
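If these variables are not already defined in your shell, a minimal sketch like the following (with placeholder values that you should adjust to your own database name and scratch directory) sets them from within Python before the rest of the notebook runs:

import os

# Placeholder values for illustration only; point these at your own database and scratch directory.
os.environ.setdefault('ALHAZEN_DB_NAME', 'em_tech')
os.environ.setdefault('LOCAL_FILE_PATH', os.path.expanduser('~/alhazen/'))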
if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

loc = os.environ['LOCAL_FILE_PATH']
db_name = 'em_tech'
Setup utils, agents, and tools
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())

llm_gpt4_1106 = ChatOpenAI(model='gpt-4-1106-preview')
llm_gpt35 = ChatOpenAI(model='gpt-3.5-turbo')
#llm = llms_lookup.get('databricks_llama3')

cb = AlhazenAgent(llm_gpt35, llm_gpt35, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

test_tk = MetadataExtractionToolkit(db=ldb, llm=llm_gpt35)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
    print('\t'+type(t).__name__)
Set Evaluation Dataset
These are cases taken directly from the *.yaml files that identify cases from the CZI CryoET Portal.
dois = {10000: ['10.1101/2022.04.12.488077'],
        10001: ['10.1101/2022.04.12.488077'],
        10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'],
        10004: ['10.1101/2023.04.28.538734'],
        10005: ['10.1038/s41594-022-00861-0'],
        10006: ['10.1038/s41586-020-2665-2'],
        10007: [],
        10008: ['10.1038/s41586-022-04971-z'],
        10009: ['10.1126/science.abm6704'],
        10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}
dois_flattened = [doi for doi_list in dois.values() for doi in doi_list]
dois_flattened = list(set(dois_flattened))
dois_flattened
Retrieve Gold Standard experimental metadata from the EMPIAR database.
- Download the entire database to a local file from: https://www.ebi.ac.uk/emdb/search/database:EMPIAR
- Save the location in a temporary variable: empiar_metadata_path
- Process the downloaded file for (A) EMDB ids and (B) DOI values for publications.
# local path to the file downloaded from the EMPIAR search results:
# https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true
# download the file and save it to a local path
url = "https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true"
empiar_metadata_path = loc+db_name+'/EMPIAR_search_results.json'
response = requests.get(url, stream=True)
with open(empiar_metadata_path, "wb") as handle:
    for data in response.iter_content():
        handle.write(data)

with open(empiar_metadata_path, 'r') as f:
    empiar_metadata = json.load(f)
empiar_dataset_ids = list(empiar_metadata.keys())
d = {}
for empiar_id in empiar_dataset_ids:
    d[empiar_id] = {'dois': [], 'emd_ids': []}
    for citation in empiar_metadata.get(empiar_id, {}).get('citation', []):
        if citation.get('doi') is not None:
            d[empiar_id]['dois'].append(citation.get('doi'))
    # default to an empty list when no cross_references are present
    for emd_id in empiar_metadata.get(empiar_id, {}).get('cross_references', []):
        d[empiar_id]['emd_ids'].append(emd_id.get('name'))
def get_nested(data, *args):
    if args and data:
        element = args[0]
        if element:
            value = data.get(element)
            return value if len(args) == 1 else get_nested(value, *args[1:])
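A quick illustration of how get_nested walks a nested dictionary one key at a time; the keys below are invented for the example and are not actual EMDB fields:

# Hypothetical nested record, used only to illustrate get_nested's behavior.
record = {'sample': {'buffer': {'ph': 7.4}}}
print(get_nested(record, 'sample', 'buffer', 'ph'))   # 7.4
print(get_nested(record, 'sample', 'missing_key'))    # None - missing keys fail quietly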
# get metadata from the EMDB entries for each case
metadlist = []

# jsonpath expressions to identify specific metadata from the EMDB entries
# focus mainly on the specimen preparation (grids, buffers, vitrification, etc.)
sd_jp = 'structure_determination_list.structure_determination[*]'
sample_preparation_type_jp = parse(sd_jp + '.method')
agg_state_jp = parse(sd_jp + '.aggregation_state')
specprep_list_jp = sd_jp + '.specimen_preparation_list.specimen_preparation[*]'
buffer_jp = parse(specprep_list_jp + '.buffer.ph')
grid_model_jp = parse(specprep_list_jp + '.grid.model')
grid_material_jp = parse(specprep_list_jp + '.grid.material')
grid_mesh_jp = parse(specprep_list_jp + '.grid.mesh')
grid_support_topology_jp = parse(specprep_list_jp + '.grid.support_film[*].film_topology')
grid_pretreatment_jp = parse(specprep_list_jp + '.grid.pretreatment.type_')
grid_vitrification_cryogen_jp = parse(specprep_list_jp + '.vitrification.cryogen_name')
grid_vit_ctemp_jp = specprep_list_jp + '.vitrification.chamber_temperature.'
grid_vit_chumid_jp = specprep_list_jp + '.vitrification.chamber_humidity'
jp_method = parse('structure_determination_list.structure_determination[*]')
i = 0
for k, v in d.items():
    #i += 1
    #if i > 10:
    #    break
    print(k, v)
    for emd_id in v['emd_ids']:
        emd_exp = requests.get('https://www.ebi.ac.uk/emdb/api/entry/experiment/'+emd_id)
        if emd_exp.status_code == 200:
            emd = emd_exp.json()
            sample_preparation_type = ', '.join([m.value for m in sample_preparation_type_jp.find(emd)])
            agg_state = ', '.join([m.value for m in agg_state_jp.find(emd)])
            buffer = ', '.join([str(m.value) for m in buffer_jp.find(emd)])
            grid_model = ', '.join([m.value for m in grid_model_jp.find(emd)])
            grid_material = ', '.join([m.value for m in grid_material_jp.find(emd)])
            grid_mesh = ', '.join([str(m.value) for m in grid_mesh_jp.find(emd)])
            grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])
            grid_pretreatment = ', '.join([m.value for m in grid_pretreatment_jp.find(emd)])
            grid_vitrification_cryogen = ', '.join([m.value for m in grid_vitrification_cryogen_jp.find(emd)])
            grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])

            grid_vit_ctemp_units = [m.value for m in parse(grid_vit_ctemp_jp+'.units').find(emd)]
            grid_vit_ctemp_values = [str(m.value) for m in parse(grid_vit_ctemp_jp+'.valueOf_').find(emd)]
            grid_vit_ctemp = ','.join([t[0]+' '+t[1] for t in zip(grid_vit_ctemp_values, grid_vit_ctemp_units)])

            grid_vit_chumid_units = [m.value for m in parse(grid_vit_chumid_jp+'.units').find(emd)]
            grid_vit_chumid_values = [str(m.value) for m in parse(grid_vit_chumid_jp+'.valueOf_').find(emd)]
            grid_vit_chumid = ', '.join([t[0]+' '+t[1] for t in zip(grid_vit_chumid_values, grid_vit_chumid_units)])

            for doi in v['dois']:
                metadlist.append({'doi': doi,
                                  'emd_id': emd_id,
                                  'sample_preparation_type': sample_preparation_type,
                                  'agg_state': agg_state,
                                  'sample_preparation_buffer_ph': buffer,
                                  'grid_model': grid_model,
                                  'grid_material': grid_material,
                                  'grid_mesh': grid_mesh,
                                  'grid_support_topology': grid_support_topology,
                                  'grid_pretreatment': grid_pretreatment,
                                  'grid_vitrification_cryogen': grid_vitrification_cryogen,
                                  'grid_vit_ctemp': grid_vit_ctemp,
                                  'grid_vit_chumid': grid_vit_chumid})
        else:
            print('ERROR: ', emd_exp.status_code)

empiar_df = pd.DataFrame(metadlist)
empiar_df.to_csv(loc+db_name+'/empiar_metadata.tsv', sep='\t', index=False)
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df
Load the EMPIAR data from disk
This is loaded from the local directory that we just created.
empiar_df = pd.read_csv(loc+db_name+'/empiar/empiar_metadata.tsv', sep='\t')
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df
Building the database
Scripts to Build / Delete the database
If you need to restore a deleted database from backup, use the following shell commands:
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

loc = os.environ['LOCAL_FILE_PATH']
current_date_time = datetime.now()
formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
backup_ceifns_database(db_name, backup_path)

create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
Build CEIFNS database from queries
Add a collection based on EMPIAR papers
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for start_i in range(0, len(empiar_dois), step):
    # clamp the final chunk so we do not index past the end of the list
    query = ' OR '.join(['doi:"'+empiar_dois[i]+'"' for i in range(start_i, min(start_i+step, len(empiar_dois)))])
    addEMPCCollection_tool.run({'id': '3', 'name': 'EMPIAR Papers', 'query': query, 'full_text': True})
def join_set(x):
    out = ''
    try:
        out = ' '.join(set(x))
    except:
        pass
    return out
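A brief illustration of join_set's behavior on made-up values: it deduplicates the entries in a group, and the bare except clause means unjoinable values simply yield an empty string.

# Illustrative calls only; set ordering is arbitrary, so the first call prints 'copper gold' or 'gold copper'.
print(join_set(['copper', 'copper', 'gold']))
print(join_set([7.0, 7.4]))   # '' - floats cannot be joined as strings, so the error is swallowed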
# identify papers that we have full text for in EMPIAR
q = ldb.session.query(SKE.id) \
    .distinct() \
    .filter(SKC.id == SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKC.id == '3') \
    .filter(or_(SKI.type == 'JATSFullText', SKI.type == 'PDFFullText'))
dois_to_include = [d[0][4:] for d in q.all()]
empiar_gold_standard = []
for i, row in empiar_df.iterrows():
    if row.doi in dois_to_include:
        empiar_gold_standard.append(row.to_dict())
empiar_gold_standard_df = pd.DataFrame(empiar_gold_standard)
empiar_gs_df = empiar_gold_standard_df.groupby(['doi']).agg({'sample_preparation_type': join_set,
                                                             'agg_state': join_set,
                                                             'sample_preparation_buffer_ph': join_set,
                                                             'grid_model': join_set,
                                                             'grid_material': join_set,
                                                             'grid_mesh': join_set,
                                                             'grid_support_topology': join_set,
                                                             'grid_pretreatment': join_set,
                                                             'grid_vitrification_cryogen': join_set,
                                                             'grid_vit_ctemp': join_set,
                                                             'grid_vit_chumid': join_set}).reset_index()
empiar_gs_df
Import papers from DOIs pertaining to CryoET-Portal records 10000-10010
The CryoET Data Portal is based on data submitted to our curation team, accompanied by papers referenced by DOIs. Each dataset is assigned an ID value that is associated with those DOIs.
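For example, the dois dictionary defined above records this mapping, so the papers linked to a single portal dataset can be looked up directly (dataset 10003 is used here purely as an illustration):

# Look up the DOIs associated with one portal dataset ID.
print(dois[10003])   # ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7']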
# use the EMPCSearchTool to run a query for the dois mentioned
query = ' OR '.join(['doi:"'+d+'"' for d_id in dois for d in dois[d_id]])
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '0', 'name': 'CryoET Portal (10000-10010)', 'query': query, 'full_text': True})
Extend Database to include all CryoET papers
cols_to_include = ['ID', 'CORPUS_NAME', 'QUERY']
df = pd.read_csv(files(em_tech_queries).joinpath('EM_Methods.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])
df

qt = QueryTranslator(df.sort_values('ID'), 'ID', 'QUERY', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS', 'METHODS'])
corpus_names = df['CORPUS_NAME']
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
    if id != 2:
        continue
    addEMPCCollection_tool.run(tool_input={'id': id, 'name': name, 'query': query, 'full_text': False})
Combine + Sample CryoET + EMPIAR Collections to provide a test set of papers.
ldb.create_new_collection_from_intersection('4', 'EMPIAR CryoET Papers', '2', '3')
Adding Machine Learning
ml_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
"Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" OR
"cryo-electron tomography" OR "cryo-et" OR cryoet) AND
("Machine Learning" OR "Artificial Intelligence" OR "Deep Learning" OR "Neural Networks")
'''
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '6',
                                       'name': 'Machine Learning in CryoET',
                                       'query': ml_query,
                                       'full_text': False})

delCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
delCollection_tool.run(tool_input={'collection_id': '6'})
Break up TIAB of papers into sentences + classify by discourse
NOTE - HUGGING FACE MODELS DO NOT WORK WELL ON THIS CORPUS. (NOT SURPRISINGLY - THEY WERE TRAINED ON MEDICAL PAPERS WHERE THE DIFFERENT SECTIONS OF THE PAPER WERE EXPLICITLY LABELED)
USE LLMS TO DO THE EXTRACTION - GPT3.5?
# Get the title/abstract discourse mapping tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
t2.run(tool_input={'collection_id': '5', 'run_label': 'dev'})
j = '''{
"Background": "Eps15-homology domain containing proteins (EHDs) are eukaryotic, dynamin-related ATPases involved in cellular membrane trafficking. They oligomerize on membranes into filaments that induce membrane tubulation. While EHD crystal structures in open and closed conformations were previously reported, little structural information is available for the membrane-bound oligomeric form. Consequently, mechanistic insights into the membrane remodeling mechanism have remained sparse.",
"Objectives_Methods": "Here, by using cryo-electron tomography and subtomogram averaging, we determined structures of nucleotide-bound EHD4 filaments on membrane tubes of various diameters at an average resolution of 7.6 Å.",
"Results_Conclusions": "Assembly of EHD4 is mediated via interfaces in the G-domain and the helical domain. The oligomerized EHD4 structure resembles the closed conformation, where the tips of the helical domains protrude into the membrane. The variation in filament geometry and tube radius suggests a spontaneous filament curvature of approximately 1/70 nm<sup>-1</sup>. Combining the available structural and functional data, we suggest a model for EHD-mediated membrane remodeling."
}'''
json.loads(j)
# Run the title/abstract discourse mapping tool over collection 2 with each listed model
models = ['databricks_dbrx']
for m in models:
    llm = llms_lookup.get(m)
    cb = AlhazenAgent(llm, llm, db_name=db_name)
    t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
    t2.run(tool_input={'collection_id': '2', 'run_label': m, 'repeat_run': False})
= ["doi:10.1101/2024.03.04.583254", "doi:10.1101/2023.11.21.567712",
to_remove "doi:10.3791/6515", "doi:10.1101/2023.07.28.550950",
"doi:10.1093/micmic/ozad067.483", "doi:10.1007/978-1-0716-2639-9_20",
"doi:10.1016/j.yjsbx.2022.100076", "doi:10.1016/j.xpro.2022.101658",
"doi:10.1016/j.cell.2022.06.034", "doi:10.1093/plphys/kiab449",
"doi:10.1073/pnas.2118020118", "doi:10.3791/62886",
"doi:10.20944/preprints202105.0098.v1", "doi:10.1016/bs.mcb.2020.12.009",
"doi:10.1007/978-1-0716-0966-8_1", "doi:10.1007/978-1-0716-0966-8_2",
"doi:10.21769/bioprotoc.3768", "doi:10.1371/journal.ppat.1008883",
"doi:10.1101/2020.05.19.104828", "doi:10.1073/pnas.1916331116",
"doi:10.1042/bst20170351_cor", "doi:10.1038/s41594-018-0043-7",
"doi:10.1007/978-1-4939-8585-2_4", "doi:10.1007/s41048-017-0040-0",
"doi:10.1007/978-1-4939-6927-2_20", "doi:10.1016/j.str.2015.03.008",
"doi:10.1007/978-1-62703-227-8_4", "doi:10.1016/b978-0-12-397945-2.00017-2",
"doi:10.1016/j.jmb.2010.10.021", "doi:10.1186/1757-5036-3-6",
"doi:10.1016/j.jmb.2008.03.014", "doi:10.1007/978-1-59745-294-6_20"]
for d in to_remove:
= """
q SELECT DISTINCT n.id FROM langchain_pg_embedding as emb, "Note" as n
WHERE emb.cmetadata->>'n_type' = 'TiAbMappingNote__Discourse' AND
emb.cmetadata->>'about_id' = '{}' AND
emb.cmetadata->>'discourse_type' = 'ResultsConclusions' AND
emb.cmetadata->>'n_id' = n.id;""".format(d)
for row in ldb.session.execute(text(q)).all():
0], commit_this=True) ldb.delete_note(row[
ldb.session.rollback()
exp_q = ldb.session.query(SKE) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKI.type == 'CitationRecord') \
    .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
    .order_by(desc(SKE.publication_date))

count = 0
for e in tqdm(exp_q.all()):
    q = ldb.session.query(N) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == e.id) \
        .filter(N.type == 'TiAbMappingNote__Discourse')
    for n in q.all():
        dmap = json.loads(n.content)
        if 'Objectives_Methods' in dmap.keys():
            print('beep')
            #new_dmap = {'Background': dmap.get('Background'), 'ObjectivesMethods': dmap.get('Objectives_Methods'), 'ResultsConclusions': dmap.get('Results_Conclusions')}
            #n.content = json.dumps(new_dmap, indent=4)
            #ldb.session.flush()
#ldb.session.commit()
ldb.session.rollback()
'''Runs through the list of expressions, generates embeddings, and stores them in the database'''
exp_q = ldb.session.query(SKE) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKI.type == 'CitationRecord') \
    .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
    .order_by(desc(SKE.publication_date))

texts = []
metadatas = []

count = 0
for e in tqdm(exp_q.all()):
    q = ldb.session.query(N) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == e.id) \
        .filter(N.type == 'TiAbMappingNote__Discourse')

    n = q.first()
    dmap = json.loads(n.content)
    for dtype in ['Background', 'ObjectivesMethods', 'ResultsConclusions']:
        t = dmap.get(dtype)
        if t is None:
            continue
        texts.append(t)
        metadatas.append({'about_id': e.id,
                          'about_type': 'ScientificKnowledgeExpression',
                          'n_id': n.id,
                          'n_type': 'TiAbMappingNote__Discourse',
                          'discourse_type': dtype})

docs = []
for t, m in zip(texts, metadatas):
    docs.append(Document(page_content=t, metadata=m))

db = PGVector.from_documents(
    embedding=ldb.embed_model,
    documents=docs,
    collection_name="Note"
)
model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1",
                                          truncation=True,
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i): l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification",
                      model=model_path,
                      tokenizer=tokenizer,
                      truncation=True,
                      batch_size=8,
                      device='mps')
self = ldb
collection_id = '2'

q1 = self.session.query(SKE, SKI) \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
    .filter(SKI_HP.has_part_id == SKF.id) \
    .filter(SKI.type == 'CitationRecord') \
    .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint'))

for ske, ski in tqdm(q1.all()):
    b = ''
    om = ''
    rc = ''

    fragments = []
    for f in ski.has_part:
        if f.type in ['title', 'abstract']:
            fragments.append(f)

    # USE AN LLM HERE INSTEAD OF A DEEP LEARNING CLASSIFIER
    for skf in sorted(fragments, key=lambda f: f.offset):
        for s in self.sent_detector.tokenize(skf.content):
            # classify each sentence and route it to the appropriate discourse bucket
            m = classifier(s)
            l = lookup.get(m[0].get('label'))
            if l == 'BACKGROUND':
                if len(b) > 0:
                    b += '\n'
                b += s
            elif l == 'OBJECTIVE' or l == 'METHODS':
                if len(om) > 0:
                    om += '\n'
                om += s
            else:
                if len(rc) > 0:
                    rc += '\n'
                rc += s

    skf_stem = ske.id+'.'+ski.type+'.'
    if len(b) > 0:
        f_b = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
                                          type='background_sentences', offset=-1, length=len(b),
                                          name=skf_stem+'background', content=b)
        self.session.add(f_b)
        ski.has_part.append(f_b)
        f_b.part_of = ski.id
    if len(om) > 0:
        f_om = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
                                           type='objective_methods_sentences', offset=-1, length=len(om),
                                           name=skf_stem+'objective_methods', content=om)
        self.session.add(f_om)
        ski.has_part.append(f_om)
        f_om.part_of = ski.id
    if len(rc) > 0:
        f_rc = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10],
                                           type='results_conclusions_sentences', offset=-1, length=len(rc),
                                           name=skf_stem+'results_conclusions', content=rc)
        self.session.add(f_rc)
        ski.has_part.append(f_rc)
        f_rc.part_of = ski.id
    self.session.flush()
self.session.commit()
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKF) \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
    .filter(SKI_HP.has_part_id == SKF.id) \
    .filter(SKI.type == 'CitationRecord') \
    .filter(or_(SKF.type == 'results_conclusions_sentences',
                SKF.type == 'objective_methods_sentences',
                SKF.type == 'background_sentences'))
for skf in tqdm(q2.all()):
    self.delete_fragment(skf.id)
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKE, SKF) \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id) \
    .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
    .filter(SKI_HP.has_part_id == SKF.id) \
    .filter(SKI.type == 'CitationRecord') \
    .filter(SKF.type == 'objective_methods_sentences') \
    .order_by(desc(SKE.publication_date)) \
    .order_by(SKF.name)
for ske, skf in tqdm(q2.all()):
    print(skf)
Get full text copies of all the papers about CryoET
cb.agent_executor.invoke({'input': 'Get full text copies of all papers in the collection with id="2".'})

ldb.create_new_collection_from_sample('5', 'EMPIAR CryoET Papers Tests', '4', 20,
                                      ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])
Analyze Collections
q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
    .filter(SKC.id == SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id == SKI.id)
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique())).fillna(0)
Survey + Run Classifications over Papers
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
l = []
q = ldb.session.query(N, SKE) \
    .filter(N.id == NIA.Note_id) \
    .filter(NIA.is_about_id == SKE.id) \
    .filter(N.type == 'TiAbClassificationNote__cryoet_study_types')

output = []
print(len(q.all()))
for n, ske in q.all():
    ldb.delete_note(n.id)
print(len(q.all()))
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'classification_type': 'cryoet_study_types', 'repeat_run': True})

t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '2', 'classification_type': 'cryoet_study_types'})
l = []

ldb.session.rollback()
q = ldb.session.query(N, SKE) \
    .join(NIA, NIA.Note_id == N.id) \
    .join(SKE, SKE.id == NIA.is_about_id) \
    .join(SKC_HM, SKE.id == SKC_HM.has_members_id) \
    .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
    .filter(SKC_HM.ScientificKnowledgeCollection_id == '5') \
    .order_by(SKE.id, N.provenance)

output = []
for n, ske in q.all():
    tup = json.loads(n.content)
    tup['doi'] = 'http://doi.org/'+re.sub('doi:', '', ske.id)
    tup['year'] = ske.publication_date.year
    tup['month'] = ske.publication_date.month
    tup['ref'] = ske.content
    output.append(tup)

df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df
study_type_lookup = {'A': 'Viral Pathogens',
                     'B': 'Mutated protein structure',
                     'C': 'Bacterial pathogens',
                     'D': 'Plant cells',
                     'E': 'Material science',
                     'F': 'Intracellular Transport Structure',
                     'G': 'Synapses or Vesicle Release',
                     'H': 'Other Intracellular Structure',
                     'I': 'Cellular Processes',
                     'J': 'Dynamics of molecular interactions',
                     'K': 'New CryoET imaging methods',
                     'L': 'New data analysis methods'}
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20

for k in study_type_lookup.keys():
    df1 = df[df['cryoet_study_type_code'] == k]
    dois_to_add = [re.sub('http://doi.org/', 'doi:', r.doi) for i, r in df1.iterrows()]

    c_id = '2.'+k
    c_name = 'CryoET - ' + study_type_lookup[k]

    corpus = None
    all_existing_query = ldb.session.query(SKC).filter(SKC.id == c_id)
    for c in all_existing_query.all():
        corpus = c
    if corpus is None:
        corpus = ScientificKnowledgeCollection(id=c_id,
                                               type='skem:ScientificKnowledgeCollection',
                                               name=c_name,
                                               has_members=[])
        ldb.session.add(corpus)
    ldb.session.flush()

    for doi in tqdm(dois_to_add):
        p = ldb.session.query(SKE) \
            .filter(SKE.id == doi).first()
        if p is None:
            continue
        ldb.session.add(p)
        corpus.has_members.append(p)
        p.member_of.append(corpus)
    ldb.session.flush()
ldb.session.commit()
delete_collection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]

for k in study_type_lookup.keys():
    print(k)
    delete_collection_tool.run({'collection_id': '2.'+k})
Survey + Run Extractions over Papers
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractExtraction_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'extraction_type': 'cryoet'})
Tests + Checks
Agent tool selection + execution + interpretation
cb.agent_executor.invoke({'input': 'Hi who are you and what can you do?'})
Run MetaData Extraction Chain over listed papers
Here, we run various versions of the metadata extraction tool to examine performance over the CryoET dataset.
q = ldb.session.query(SKE.id) \
    .filter(SKC.id == SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKC.id == '5')
dois = [e.id for e in q.all()]
dois
# need to count tokens submitted to the server as a way of tracking usage.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device='mps', token=os.environ['HF_API_KEY'])
prompt = "The methods section of the paper is as follows:"
tokenized = tokenizer(prompt, return_tensors="pt")
print(len(tokenized["input_ids"][0]))
# How long are methods sections in the CryoET papers?
ldb.session.rollback()
q = ldb.session.query(SKE.id) \
    .filter(SKC.id == SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id == SKE.id) \
    .filter(SKC.id == '2') \
    .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint'))

encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

tups = []
for e in tqdm(q.all()):
    item_types = set()
    item_type = None
    for i in ldb.list_items_for_expression(e.id):
        item_types.add(i.type)
    for i_type in item_types:
        if i_type == 'CitationRecord':
            continue
        item_type = i_type
        break
    if item_type is None:
        continue

    fragments = [f.content for f in ldb.list_fragments_for_paper(e.id, item_type, fragment_types=['section'])]
    on_off = False
    text = ''
    all_text = ''
    for t in fragments:
        all_text += t
        l1 = t.split('\n')[0].lower()
        if 'method' in l1:
            on_off = True
        elif 'results' in l1 or 'discussion' in l1 or 'conclusion' in l1 or 'acknowledgements' in l1 \
                or 'references' in l1 or 'supplementary' in l1 or 'appendix' in l1 or 'introduction' in l1 \
                or 'abstract' in l1 or 'cited' in l1:
            on_off = False
        if on_off:
            if len(text) > 0:
                text += '\n\n'
            text += t

    all_text_length = len(tokenizer(all_text, return_tensors="pt")['input_ids'][0])
    text_length = len(tokenizer(text, return_tensors="pt")['input_ids'][0])
    tups.append({'doi': e.id, 'doc_length': all_text_length, 'method_length': text_length})

df_length = pd.DataFrame(tups)
df_length
print(len(df_length[df_length['method_length']>8000]))
print(len(df_length[df_length['method_length']<8000]))
def plot_length_distribution(df_length):
    plt.hist(df_length, bins=10)
    plt.xlabel('Length')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths')
    plt.show()

plot_length_distribution(df_length['method_length'])
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
for i, r in tqdm(df_length.iterrows()):
    if len(df[df['doi'] == r['doi']]) > 0:
        continue
    # Run the metadata extraction tool on the doi
    try:
        t2.run(tool_input={'paper_id': r['doi'], 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})
    except Exception as e:
        print(e)
        continue
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df2 = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
    item_types = set()
    l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test')
    if len(l) == 0:
        continue
    df2 = pd.concat([df2, pd.DataFrame(l)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
    if r['method_length'] < 8000:
        item_types = set()
        l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test_llama3')
        if len(l) == 0:
            continue
        df = pd.concat([df, pd.DataFrame(l)])
df
df[df['doi'] == 'doi:10.1101/2022.04.12.488077']
df2[df2['doi'] == 'doi:10.1101/2022.04.12.488077']
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'
t2.compile_answers('cryoet', metadata_dir)
t2.write_answers_as_notes('cryoet', metadata_dir)
#sorted(list(set([doi for q in t2.examples for doi in t2.examples[q]])))
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

# Hack to get the path to the metadata directory as a string
#metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'

# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)

# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l)])
# Iterate over papers to run the metadata extraction tool
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
for d in [d for d in dois]:
    item_types = set()

    # Skip if the doi is already in the database
    if len(df) > 0 and d in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})

    # Add the results to the dataframe
    l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l2)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
df_final = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df_final = pd.concat([df_final, pd.DataFrame(l)])
df_final
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
l = []
for d in [d for d in dois]:
    item_types = set()
    pred1 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    pred2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test_dbrx')
    gold = t2.read_metadata_extraction_notes(d, 'cryoet', 'gold')
    if pred1 is None or pred2 is None or gold is None or \
            len(pred1) == 0 or len(pred2) == 0 or len(gold) != 1:
        continue
    for k in gold[0]:
        g_case = gold[0][k]
        if g_case == '' or g_case is None:
            continue
        for j, p_case in enumerate(pred1):
            sim = fuzz.ratio(str(g_case), str(p_case.get(k, ''))) / 100.0
            print(k, str(g_case), str(p_case.get(k, '')), sim)
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

df = t2.report_metadata_extraction_for_collection('5', 'cryoet', 'test').set_index('doi')
df.to_csv(loc+'/'+db_name+'/reports/cryoet_metadata_gpt4.tsv', sep='\t')
ldb.create_zip_archive_of_full_text_files('5', loc+'/'+db_name+'/full_text_files.zip')
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
    .filter(N.id == NIA.Note_id) \
    .filter(NIA.is_about_id == SKE.id) \
    .filter(N.type == 'MetadataExtractionNote')
l = []
for row in q3.all():
    paper = row[0]
    name = row[1]
    # provenance = json.loads(row[2])
    result = json.loads(row[3])
    kv = {k: result[k] for k in result}
    kv['DOI'] = paper
    kv['run'] = name
    l.append(kv)

# create a dataframe from the list of dictionaries with DOI as the index column
if len(l) > 0:
    df = pd.DataFrame(l).set_index(['DOI', 'run'])
else:
    df = pd.DataFrame()
df
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for row in q3.all():
    d_id = row[0]
    e = ldb.session.query(SKE).filter(SKE.id == d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)
Protocol Modeling + Extraction
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr')
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')
llm3 = ChatOpenAI(model='gpt-3.5-turbo')

d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")
t1 = ProcotolEntitiesExtractionTool(db=ldb, llm=llm3, description=d)
entities = t1.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
entities

t2 = ProcotolProcessesExtractionTool(db=ldb, llm=llm3, description=d)
processes = t2.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
processes.get('data')