CryoET

Methods to extract metadata and study the structure of scientific protocols based on all available online data and knowledge.

Introduction to CryoET

Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.

This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute (CZII) in developing the CryoET Data Portal, an open-source repository for CryoET data.

Basics

Python Imports

Set Python imports, environment variables, and other crucial setup parameters here.

from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database

from alhazen.utils.searchEngineUtils import *


from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml

import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from rapidfuzz import fuzz

# Plot the distribution of the lengths of the methods sections 
import seaborn as sns  
import matplotlib.pyplot as plt
import tiktoken
import transformers
import torch

from transformers import pipeline, AutoModel, AutoModelForCausalLM, AutoTokenizer
import local_resources.queries.em_tech as em_tech_queries
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
import json
from jsonpath_ng import jsonpath, parse

Environment Variables

Remember to set the following environment variables for this code:

  • ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
  • LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
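For example, you could set them from Python before running the rest of the notebook (a minimal sketch; the values below are placeholders for your own setup):

os.environ.setdefault('ALHAZEN_DB_NAME', 'em_tech')        # placeholder database name
os.environ.setdefault('LOCAL_FILE_PATH', '/tmp/alhazen/')  # placeholder scratch directory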
if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])    

loc = os.environ['LOCAL_FILE_PATH']
db_name = 'em_tech'

Set up utils, agents, and tools

ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())
llm_gpt4_1106 = ChatOpenAI(model='gpt-4-1106-preview') 
llm_gpt35 = ChatOpenAI(model='gpt-3.5-turbo')
#llm = llms_lookup.get('databricks_llama3')

cb = AlhazenAgent(llm_gpt35, llm_gpt35, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

test_tk = MetadataExtractionToolkit(db=ldb, llm=llm_gpt35)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
    print('\t'+type(t).__name__)

Set Evaluation Dataset

These are cases taken directly from the *.yaml files that describe datasets in the CZI CryoET Data Portal.

Identify cases from the CZI CryoET Portal.

dois = {10000: ['10.1101/2022.04.12.488077'], 
        10001: ['10.1101/2022.04.12.488077'], 
        10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'], 
        10004: ['10.1101/2023.04.28.538734'], 
        10005: ['10.1038/s41594-022-00861-0'], 
        10006: ['10.1038/s41586-020-2665-2'], 
        10007: [], 
        10008: ['10.1038/s41586-022-04971-z'], 
        10009: ['10.1126/science.abm6704'], 
        10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}
dois_flattened = [doi for doi_list in dois.values() for doi in doi_list]
dois_flattened = list(set(dois_flattened))
dois_flattened

Retrieve gold-standard experimental metadata from the EMPIAR database.

  1. Download the entire database to a local file from: https://www.ebi.ac.uk/emdb/search/database:EMPIAR
  2. Save the location in a temporary variable: empiar_metadata_path
  3. Process the downloaded file to extract (A) EMDB IDs and (B) DOIs for the associated publications.
# local_path to the file downloaded from the EMPIAR search results: 
# https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true
# download the file and save it to a local path
url = "https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true"
empiar_metadata_path = loc+db_name+'/EMPIAR_search_results.json'
response = requests.get(url, stream=True)
with open(empiar_metadata_path, "wb") as handle:
    for data in response.iter_content(chunk_size=1024*1024):
        handle.write(data)

with open(empiar_metadata_path, 'r') as f:
    empiar_metadata = json.load(f)
empiar_dataset_ids = list(empiar_metadata.keys())
d = {}
for empiar_id in empiar_dataset_ids:
    d[empiar_id] = {'dois':[], 'emd_ids': []}
    for citation in empiar_metadata.get(empiar_id, {}).get('citation', []):
        if citation.get('doi') is not None:
            d[empiar_id]['dois'].append(citation.get('doi'))
    for emd_id in empiar_metadata.get(empiar_id, {}).get('cross_references', []):
        d[empiar_id]['emd_ids'].append(emd_id.get('name'))    

def get_nested(data, *args):
    if args and data:
        element  = args[0]
        if element:
            value = data.get(element)
            return value if len(args) == 1 else get_nested(value, *args[1:])
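# Illustrative use of get_nested (values are hypothetical): it walks nested dicts
# safely, returning None instead of raising KeyError, e.g.
#   get_nested({'a': {'b': 1}}, 'a', 'b')  -> 1
#   get_nested({'a': {'b': 1}}, 'a', 'x')  -> None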
# get metadata from the EMDB entries for each case
metadlist = []

# jsonpath expressions to identify specific metadata from the EMDB entries
# focus mainly on the specimen preparation (grids, buffers, vitrification, etc.)
sd_jp = 'structure_determination_list.structure_determination[*]'
sample_preparation_type_jp = parse(sd_jp + '.method')
agg_state_jp = parse(sd_jp + '.aggregation_state')
specprep_list_jp = sd_jp + '.specimen_preparation_list.specimen_preparation[*]'
buffer_jp = parse(specprep_list_jp + '.buffer.ph') 
grid_model_jp = parse(specprep_list_jp + '.grid.model')
grid_material_jp = parse(specprep_list_jp + '.grid.material') 
grid_mesh_jp = parse(specprep_list_jp + '.grid.mesh')
grid_support_topology_jp = parse(specprep_list_jp + '.grid.support_film[*].film_topology')
grid_pretreatment_jp = parse(specprep_list_jp + '.grid.pretreatment.type_')
grid_vitrification_cryogen_jp = parse(specprep_list_jp + '.vitrification.cryogen_name')
grid_vit_ctemp_jp = specprep_list_jp + '.vitrification.chamber_temperature'
grid_vit_chumid_jp = specprep_list_jp + '.vitrification.chamber_humidity'

jp_method = parse('structure_determination_list.structure_determination[*]')
i = 0
for k,v in d.items():
    #i += 1
    #if i > 10:
    #    break
    print(k,v)
    for emd_id in v['emd_ids']:
        emd_exp = requests.get('https://www.ebi.ac.uk/emdb/api/entry/experiment/'+emd_id)
        if emd_exp.status_code == 200:
            emd = emd_exp.json()
            sample_preparation_type = ', '.join([m.value for m in sample_preparation_type_jp.find(emd)])
            agg_state = ', '.join([m.value for m in agg_state_jp.find(emd)])
            buffer = ', '.join([str(m.value) for m in buffer_jp.find(emd)])
            grid_model = ', '.join([m.value for m in grid_model_jp.find(emd)])
            grid_material = ', '.join([m.value for m in grid_material_jp.find(emd)])
            grid_mesh = ', '.join([str(m.value) for m in grid_mesh_jp.find(emd)])
            grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])
            grid_pretreatment = ', '.join([m.value for m in grid_pretreatment_jp.find(emd)])
            grid_vitrification_cryogen = ', '.join([m.value for m in grid_vitrification_cryogen_jp.find(emd)])

            grid_vit_ctemp_units = [m.value for m in parse(grid_vit_ctemp_jp+'.units').find(emd)]
            grid_vit_ctemp_values = [str(m.value) for m in parse(grid_vit_ctemp_jp+'.valueOf_').find(emd)]
            grid_vit_ctemp = ','.join([t[0]+' '+t[1] for t in zip(grid_vit_ctemp_values, grid_vit_ctemp_units)])

            grid_vit_chumid_units = [m.value for m in parse(grid_vit_chumid_jp+'.units').find(emd)]
            grid_vit_chumid_values = [str(m.value) for m in parse(grid_vit_chumid_jp+'.valueOf_').find(emd)]
            grid_vit_chumid = ', '.join([t[0]+' '+t[1] for t in zip(grid_vit_chumid_values, grid_vit_chumid_units)])

            for doi in v['dois']:
                metadlist.append({'doi':doi, 
                                  'emd_id': emd_id, 
                                  'sample_preparation_type': sample_preparation_type, 
                                  'agg_state': agg_state, 
                                  'sample_preparation_buffer_ph': buffer, 
                                  'grid_model': grid_model, 
                                  'grid_material': grid_material, 
                                  'grid_mesh': grid_mesh, 
                                  'grid_support_topology': grid_support_topology, 
                                  'grid_pretreatment': grid_pretreatment, 
                                  'grid_vitrification_cryogen': grid_vitrification_cryogen, 
                                  'grid_vit_ctemp': grid_vit_ctemp, 
                                  'grid_vit_chumid': grid_vit_chumid})
        else:
            print('ERROR: ', emd_exp.status_code)

empiar_df = pd.DataFrame(metadlist)
empiar_df.to_csv(loc+db_name+'/empiar_metadata.tsv', sep='\t', index=False)
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df

Load the EMPIAR data from disk

This reads the file back from the local directory we just created.

empiar_df = pd.read_csv(loc+db_name+'/empiar_metadata.tsv', sep='\t')
empiar_dois = sorted(empiar_df['doi'].unique())
empiar_df

Building the database

Scripts to Build / Delete the database

If you need to restore a deleted database from backup, use the following shell commands:

$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
loc = os.environ['LOCAL_FILE_PATH']
current_date_time = datetime.now()
formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
backup_ceifns_database(db_name, backup_path)
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

Build CEIFNS database from queries

Add a collection based on EMPIAR papers

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for start_i in range(0, len(empiar_dois), step):
    query = ' OR '.join(['doi:"'+doi+'"' for doi in empiar_dois[start_i:start_i+step]])
    addEMPCCollection_tool.run({'id': '3', 'name':'EMPIAR Papers', 'query':query, 'full_text':True})
def join_set(x):
    out = ''
    try:
        out = ' '.join(set(x))
    except:
        pass
    return out
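# join_set is used below to merge duplicate EMPIAR field values per DOI,
# e.g. join_set(['300', '300']) -> '300'; non-iterable input (e.g. NaN) returns ''.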

# identify papers that we have full text for in EMPIAR
q = ldb.session.query(SKE.id) \
        .distinct() \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKC.id == '3') \
        .filter(or_(SKI.type == 'JATSFullText', SKI.type == 'PDFFullText')) 
dois_to_include = [d[0][4:] for d in q.all()]    

empiar_gold_standard = []
for i, row in empiar_df.iterrows():
    if row.doi in dois_to_include:
        empiar_gold_standard.append( row.to_dict() )
empiar_gold_standard_df = pd.DataFrame(empiar_gold_standard)

empiar_gs_df = empiar_gold_standard_df.groupby(['doi']).agg({'sample_preparation_type': join_set, 
                                                             'agg_state': join_set, 
                                                             'sample_preparation_buffer_ph': join_set, 
                                                             'grid_model': join_set, 
                                                             'grid_material': join_set, 
                                                             'grid_mesh': join_set, 
                                                             'grid_support_topology': join_set, 
                                                             'grid_pretreatment': join_set, 
                                                             'grid_vitrification_cryogen': join_set, 
                                                             'grid_vit_ctemp': join_set, 
                                                             'grid_vit_chumid': join_set}).reset_index()
empiar_gs_df

Import papers from DOIs pertaining to CryoET-Portal records 10000-10010

The CryoET Data Portal is built from data submitted to our curation team, accompanied by papers referenced by DOIs. Each dataset is assigned an ID that is associated with one or more DOIs.

# use the EMPCSearchTool to run a query for the dois mentioned
query = ' OR '.join(['doi:"'+d+'"' for d_id in dois for d in dois[d_id] ])
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '0', 'name':'CryoET Portal (10000-10010)', 'query':query, 'full_text':True})

Extend Database to include all CryoET papers

cols_to_include = ['ID', 'CORPUS_NAME', 'QUERY']
df = pd.read_csv(files(em_tech_queries).joinpath('EM_Methods.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])
df
qt = QueryTranslator(df.sort_values('ID'), 'ID', 'QUERY', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS', 'METHODS'])
corpus_names = df['CORPUS_NAME']

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
    if id != 2:
        continue
    addEMPCCollection_tool.run(tool_input={'id': id, 'name':name, 'query':query, 'full_text':False})

Combine + Sample CryoET + EMPIAR Collections to provide a test set of papers.

ldb.create_new_collection_from_intersection('4', 'EMPIAR CryoET Papers', '2', '3')

Adding Machine Learning

ml_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
    "Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" or 
    "cryo-electron tomography" OR "cryo-et" OR cryoet ) AND 
("Machine Learning" OR "Artificial Intelligence" OR "Deep Learning" OR "Neural Networks")
'''
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '6', 
                                       'name': 'Machine Learning in CryoET', 
                                       'query': ml_query, 
                                       'full_text': False})
delCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
delCollection_tool.run(tool_input={'collection_id': '6'})

Break up TIAB of papers into sentences + classify by discourse

Note: Hugging Face discourse models do not work well on this corpus. This is not surprising, since they were trained on medical papers in which the different sections of the paper were explicitly labeled.

Use LLMs to do the extraction instead (e.g., GPT-3.5).

# Get the title/abstract discourse mapping tool and run it over the test collection
t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
t2.run(tool_input={'collection_id': '5', 'run_label': 'dev'})
j = '''{
"Background": "Eps15-homology domain containing proteins (EHDs) are eukaryotic, dynamin-related ATPases involved in cellular membrane trafficking. They oligomerize on membranes into filaments that induce membrane tubulation. While EHD crystal structures in open and closed conformations were previously reported, little structural information is available for the membrane-bound oligomeric form. Consequently, mechanistic insights into the membrane remodeling mechanism have remained sparse.",
"Objectives_Methods": "Here, by using cryo-electron tomography and subtomogram averaging, we determined structures of nucleotide-bound EHD4 filaments on membrane tubes of various diameters at an average resolution of 7.6 Å.",
"Results_Conclusions": "Assembly of EHD4 is mediated via interfaces in the G-domain and the helical domain. The oligomerized EHD4 structure resembles the closed conformation, where the tips of the helical domains protrude into the membrane. The variation in filament geometry and tube radius suggests a spontaneous filament curvature of approximately 1/70 nm<sup>-1</sup>. Combining the available structural and functional data, we suggest a model for EHD-mediated membrane remodeling."
}'''
json.loads(j)
# Run the title/abstract discourse mapping tool over collection 2 with each listed model
models = ['databricks_dbrx']
for m in models:
    llm = llms_lookup.get(m)
    cb = AlhazenAgent(llm, llm, db_name=db_name)
    t2 = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractDiscourseMappingTool)][0]
    t2.run(tool_input={'collection_id': '2', 'run_label': m, 'repeat_run': False})
to_remove = ["doi:10.1101/2024.03.04.583254", "doi:10.1101/2023.11.21.567712",
            "doi:10.3791/6515", "doi:10.1101/2023.07.28.550950",
            "doi:10.1093/micmic/ozad067.483", "doi:10.1007/978-1-0716-2639-9_20",
            "doi:10.1016/j.yjsbx.2022.100076", "doi:10.1016/j.xpro.2022.101658",
            "doi:10.1016/j.cell.2022.06.034", "doi:10.1093/plphys/kiab449",
            "doi:10.1073/pnas.2118020118", "doi:10.3791/62886",
            "doi:10.20944/preprints202105.0098.v1", "doi:10.1016/bs.mcb.2020.12.009",
            "doi:10.1007/978-1-0716-0966-8_1", "doi:10.1007/978-1-0716-0966-8_2",
            "doi:10.21769/bioprotoc.3768", "doi:10.1371/journal.ppat.1008883",
            "doi:10.1101/2020.05.19.104828", "doi:10.1073/pnas.1916331116",
            "doi:10.1042/bst20170351_cor", "doi:10.1038/s41594-018-0043-7",
            "doi:10.1007/978-1-4939-8585-2_4", "doi:10.1007/s41048-017-0040-0",
            "doi:10.1007/978-1-4939-6927-2_20", "doi:10.1016/j.str.2015.03.008",
            "doi:10.1007/978-1-62703-227-8_4", "doi:10.1016/b978-0-12-397945-2.00017-2",
            "doi:10.1016/j.jmb.2010.10.021", "doi:10.1186/1757-5036-3-6",
            "doi:10.1016/j.jmb.2008.03.014", "doi:10.1007/978-1-59745-294-6_20"]

for d in to_remove:
    q = """
    SELECT DISTINCT n.id FROM langchain_pg_embedding as emb, "Note" as n
    WHERE emb.cmetadata->>'n_type' = 'TiAbMappingNote__Discourse' AND
        emb.cmetadata->>'about_id' = '{}' AND 
        emb.cmetadata->>'discourse_type' = 'ResultsConclusions' AND 
        emb.cmetadata->>'n_id' = n.id;""".format(d)
    for row in ldb.session.execute(text(q)).all():
        ldb.delete_note(row[0], commit_this=True)
ldb.session.rollback()
exp_q = ldb.session.query(SKE) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.type=='CitationRecord') \
        .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
        .order_by(desc(SKE.publication_date))

count = 0
for e in tqdm(exp_q.all()):
    q = ldb.session.query(N) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == e.id) \
        .filter(N.type =='TiAbMappingNote__Discourse')
    for n in q.all():
        dmap = json.loads(n.content) 
        if 'Objectives_Methods' in dmap.keys():
            print('beep')
            #new_dmap = {'Background': dmap.get('Background'), 'ObjectivesMethods': dmap.get('Objectives_Methods'), 'ResultsConclusions': dmap.get('Results_Conclusions')}
            #n.content = json.dumps(new_dmap, indent=4)
            #ldb.session.flush()
#ldb.session.commit()
ldb.session.rollback()
exp_q = ldb.session.query(SKE) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == str('2')) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.type=='CitationRecord') \
        .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) \
        .order_by(desc(SKE.publication_date))

texts = []
metadatas = []

count = 0
for e in tqdm(exp_q.all()):
    q = ldb.session.query(N) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == e.id) \
        .filter(N.type =='TiAbMappingNote__Discourse')


    n = q.first()
    dmap = json.loads(n.content)
    # Run through the list of expressions, collect the text and metadata, then generate embeddings and store them below.

    for dtype in ['Background', 'ObjectivesMethods', 'ResultsConclusions']:
        t = dmap.get(dtype)
        if t is None:
            continue
        texts.append(t)
        metadatas.append({'about_id': e.id, \
                        'about_type': 'ScientificKnowledgeExpression', \
                        'n_id': n.id, \
                        'n_type': 'TiAbMappingNote__Discourse', \
                        'discourse_type': dtype})

docs = []
for t,m in zip(texts, metadatas):
    docs.append(Document(page_content=t, metadata=m))
    
db = PGVector.from_documents(
    embedding=ldb.embed_model,
    documents=docs,
    collection_name="Note"
)
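Once the discourse-level embeddings are stored, they can be queried back, e.g. to retrieve methods-style passages on a topic. A minimal sketch (the query string and metadata filter are illustrative):

hits = db.similarity_search('cryo-focused ion beam milling of lamellae', k=5,
                            filter={'discourse_type': 'ObjectivesMethods'})
for hit in hits:
    print(hit.metadata.get('about_id'), hit.page_content[:100])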
model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1", 
                                          truncation=True, 
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification", 
                      model = model_path, 
                      tokenizer=tokenizer, 
                      truncation=True,
                      batch_size=8,
                      device='mps')
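As a quick sanity check, the classifier can be run on a single made-up sentence; labels come back as LABEL_n and map through the lookup table defined above:

example_sentence = 'Tilt series were collected on a Titan Krios operated at 300 kV.'  # illustrative
pred = classifier(example_sentence)
print(pred[0]['label'], '->', lookup.get(pred[0]['label']), round(pred[0]['score'], 3))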
self = ldb
collection_id = '2'

q1 = self.session.query(SKE, SKI) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) 

for ske, ski in tqdm(q1.all()):
    b = ''
    om = ''
    rc = ''  

    fragments = []
    for f in ski.has_part:
      if f.type in ['title', 'abstract']:
        fragments.append(f)

    # USE AN LLM HERE INSTEAD OF A DEEP LEARNING CLASSIFIER


    for skf in sorted(fragments, key=lambda f: f.offset):
        for s in self.sent_detector.tokenize(skf.content):
            m = classifier(s)
            l = lookup.get(m[0].get('label'))
            if l == 'BACKGROUND':
                if len(b) > 0:
                    b += '\n'
                b += s
            elif l == 'OBJECTIVE' or l == 'METHODS':
                if len(om) > 0:
                    om += '\n'
                om += s
            else: 
                if len(rc) > 0:
                    rc += '\n'
                rc += s
    skf_stem = ske.id+'.'+ski.type+'.'
    if len(b) > 0:
        f_b = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='background_sentences', offset=-1, length=len(b),
                name=skf_stem+'background', content=b)
        self.session.add(f_b)
        ski.has_part.append(f_b)
        f_b.part_of = ski.id    
    if len(om) > 0:
        f_om = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='objective_methods_sentences', offset=-1, length=len(om),
                name=skf_stem+'objective_methods', content=om)
        self.session.add(f_om)
        ski.has_part.append(f_om)
        f_om.part_of = ski.id
    if len(rc) > 0:
        f_rc = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='results_conclusions_sentences', offset=-1, length=len(rc),
                name=skf_stem+'results_conclusions', content=rc)
        self.session.add(f_rc)
        ski.has_part.append(f_rc)
        f_rc.part_of = ski.id
    self.session.flush()
self.session.commit()
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKF) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(or_(SKF.type == 'results_conclusions_sentences', \
                SKF.type == 'objective_methods_sentences', \
                SKF.type == 'background_sentences'))
for skf in tqdm(q2.all()):
    self.delete_fragment(skf.id)
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKE, SKF) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(SKF.type == 'objective_methods_sentences') \
        .order_by(desc(SKE.publication_date)) \
        .order_by(SKF.name)

for ske, skf in tqdm(q2.all()):
    print(skf)

Get full text copies of all the papers about CryoET

cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="2".'})
ldb.create_new_collection_from_sample('5', 'EMPIAR CryoET Papers Tests', '4', 20, ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])

Analyze Collections

q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) 
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique())).fillna(0)

Survey + Run Classifications over Papers

# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
l = []
q = ldb.session.query(N, SKE) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types')

output = []        
print(len(q.all()))
for n, ske in q.all():
    ldb.delete_note(n.id)    
print(len(q.all()))
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'classification_type':'cryoet_study_types', 'repeat_run':True})
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '2', 'classification_type':'cryoet_study_types'})
l = []
ldb.session.rollback()
q = ldb.session.query(N, SKE) \
        .join(NIA, NIA.Note_id == N.id) \
        .join(SKE, SKE.id == NIA.is_about_id) \
        .join(SKC_HM, SKE.id == SKC_HM.has_members_id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == '5') \
        .order_by(SKE.id, N.provenance)

output = []        
for n, ske in q.all():
        tup = json.loads(n.content)
        tup['doi'] = 'http://doi.org/'+re.sub('doi:', '', ske.id)
        tup['year'] = ske.publication_date.year
        tup['month'] = ske.publication_date.month
        tup['ref'] = ske.content
        output.append(tup)
df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df
study_type_lookup = {'A': 'Viral Pathogens', 
                     'B': "Mutated protein structure", 
                     'C': 'Bacterial pathogens', 
                     'D': 'Plant cells', 
                     'E': 'Material science', 
                     'F': 'Intracellular Transport Structure', 
                     'G': 'Synapses or Vesicle Release', 
                     'H': 'Other Intracellular Structure', 
                     'I': 'Cellular Processes',
                     'J': 'Dynamics of molecular interactions',    
                     'K': 'New CryoET imaging methods', 
                     'L': 'New data analysis methods'}

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20

for k in study_type_lookup.keys():
    df1 = df[df['cryoet_study_type_code'] == k]
    dois_to_add = [re.sub('http://doi.org/', 'doi:', r.doi) for i, r in df1.iterrows()]

    c_id = '2.'+k
    c_name = 'CryoET - ' + study_type_lookup[k]

    corpus = None
    all_existing_query = ldb.session.query(SKC).filter(SKC.id==c_id)
    for c in all_existing_query.all():
      corpus = c
    if corpus is None:      
      corpus = ScientificKnowledgeCollection(id=c_id,
                                           type='skem:ScientificKnowledgeCollection',
                                           name=c_name,
                                           has_members=[])
    ldb.session.add(corpus)
    ldb.session.flush()

    for doi in tqdm(dois_to_add):
        p = ldb.session.query(SKE) \
            .filter(SKE.id==doi).first()
        if p is None:
          continue
        ldb.session.add(p)
        corpus.has_members.append(p)
        p.member_of.append(corpus)
        ldb.session.flush()
ldb.session.commit()
delete_collection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
 
for k in study_type_lookup.keys():
    print(k)
    delete_collection_tool.run({'collection_id': '2.'+k})

Survey + Run Extractions over Papers

t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractExtraction_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'extraction_type':'cryoet'})

Tests + Checks

Agent tool selection + execution + interpretation

cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})

Run MetaData Extraction Chain over listed papers

Here, we run various versions of the metadata extraction tool to examine performance over the CryoET dataset.

q = ldb.session.query(SKE.id) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKC.id=='5')  
dois = [e.id for e in q.all()]
dois
# need to count tokens submitted to the server as a way of tracking usage. 

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device='mps', token=os.environ['HF_API_KEY'])
prompt = "The methods section of the paper is as follows:"
tokenized = tokenizer(prompt, return_tensors="pt")
print(len(tokenized["input_ids"][0]))
# How long are methods sections in the CryoET papers?
ldb.session.rollback()
q = ldb.session.query(SKE.id) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKC.id=='2') \
        .filter(or_(SKE.type=='ScientificPrimaryResearchArticle', SKE.type=='ScientificPrimaryResearchPreprint'))

encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

tups = []
for e in tqdm(q.all()):
    item_types = set()
    item_type = None
    for i in ldb.list_items_for_expression(e.id):
        item_types.add(i.type)
    for i_type in item_types:
        if i_type == 'CitationRecord':
            continue
        item_type = i_type
        break
    if item_type is None:
        continue

    fragments = [f.content for f in ldb.list_fragments_for_paper(e.id, item_type, fragment_types=['section'])]
    on_off = False
    text = ''
    all_text = ''
    for t in fragments:
        all_text += t
        l1 = t.split('\n')[0].lower()
        if 'method' in l1:
            on_off = True
        elif 'results' in l1 or 'discussion' in l1 or 'conclusion' in l1 or 'acknowledgements' in l1 \
                or 'references' in l1 or 'supplementary' in l1 or 'appendix' in l1 or 'introduction' in l1 or 'abstract' in l1 or 'cited' in l1:
            on_off = False
        if on_off:
            if len(text) > 0:
                text += '\n\n'
            text += t

    all_text_length = len(tokenizer(all_text, return_tensors="pt")['input_ids'][0])
    text_length = len(tokenizer(text, return_tensors="pt")['input_ids'][0])
    tups.append({'doi':e.id, 'doc_length': all_text_length, 'method_length': text_length})
df_length = pd.DataFrame(tups)
df_length
print(len(df_length[df_length['method_length']>8000]))
print(len(df_length[df_length['method_length']<8000]))


def plot_length_distribution(lengths):
    plt.hist(lengths, bins=10)
    plt.xlabel('Length (tokens)')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths')
    plt.show()

plot_length_distribution(df_length['method_length'])
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
for i, r in tqdm(df_length.iterrows()):
    if len(df[df['doi']==r['doi']]) > 0:
        continue
    # Run the metadata extraction tool on the doi
    try: 
        t2.run(tool_input={'paper_id': r['doi'], 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})
    except Exception as e:
        print(e)
        continue
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df2 = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
        item_types = set()
        l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test')
        if(len(l) == 0):
            continue
        df2 = pd.concat([df2, pd.DataFrame(l)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
    if r['method_length'] < 8000:
        item_types = set()
        l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test_llama3')
        if(len(l) == 0):
            continue
        df = pd.concat([df, pd.DataFrame(l)]) 
df
df[df['doi']=='doi:10.1101/2022.04.12.488077']
df2[df2['doi']=='doi:10.1101/2022.04.12.488077']
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'
t2.compile_answers('cryoet', metadata_dir)
t2.write_answers_as_notes('cryoet', metadata_dir)
#sorted(list(set([doi for q in t2.examples for doi in t2.examples[q]])))
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

# Hack to get the path to the metadata directory as a string
#metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'

# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)

# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l)]) 
     
# Iterate over papers to run the metadata extraction tool
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
for d in [d for d in dois]:
    item_types = set()

    # Skip if the doi is already in the database
    if len(df)>0 and d in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})

    # Add the results to the dataframe    
    l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l2)])
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
df_final = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df_final = pd.concat([df_final, pd.DataFrame(l)]) 
df_final
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
l = []
for d in [d for d in dois]:
    item_types = set()
    pred1 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    pred2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test_dbrx')
    gold = t2.read_metadata_extraction_notes(d, 'cryoet', 'gold') 
    if pred1 is None or pred2 is None or gold is None or \
            len(pred1)==0 or len(pred2)==0 or len(gold)!=1:
        continue
    for k in gold[0]:
        g_case = gold[0][k]
        if g_case=='' or g_case is None:
            continue    
        for j, p_case in enumerate(pred1):
            sim = fuzz.ratio(str(g_case), str(p_case.get(k,''))) / 100.0
            print(k, str(g_case), str(p_case.get(k,'')), sim)
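One way to summarize these per-field similarity scores is to keep the best-matching prediction for each gold field and average by field (a sketch that reuses the comparison above):

rows = []
for d in dois:
    pred1 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    gold = t2.read_metadata_extraction_notes(d, 'cryoet', 'gold')
    if not pred1 or not gold or len(gold) != 1:
        continue
    for k, g_case in gold[0].items():
        if g_case in ('', None):
            continue
        best = max(fuzz.ratio(str(g_case), str(p.get(k, ''))) / 100.0 for p in pred1)
        rows.append({'doi': d, 'field': k, 'best_similarity': best})
score_df = pd.DataFrame(rows)
score_df.groupby('field')['best_similarity'].mean()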
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

df = t2.report_metadata_extraction_for_collection('5', 'cryoet', 'test').set_index('doi')
df.to_csv(loc+'/'+db_name+'/reports/cryoet_metadata_gpt4.tsv', sep='\t')
ldb.create_zip_archive_of_full_text_files('5', loc+'/'+db_name+'/full_text_files.zip')
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'MetadataExtractionNote') 
l = []
for row in q3.all():
    paper = row[0]
    name = row[1]
#    provenance = json.loads(row[2])
    result = json.loads(row[3])
    kv = {k:result[k] for k in result}
    kv['DOI'] = paper
    kv['run'] = name
    l.append(kv)
# create a dataframe from the list of dictionaries with DOI as the index column
if len(l)>0:
    df = pd.DataFrame(l).set_index(['DOI', 'run'])
else: 
    df = pd.DataFrame()
df
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for row in q3.all():
    d_id = row[0]
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)

Protocol Modeling + Extraction

ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr') 
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
llm3 = ChatOpenAI(model='gpt-3.5-turbo') 
d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")
t1 = ProcotolEntitiesExtractionTool(db=ldb, llm=llm3, description=d)
entities = t1.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
entities
t2 = ProcotolProcessesExtractionTool(db=ldb, llm=llm3, description=d)
processes = t2.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
processes.get('data')