CryoET Tutorial

This tutorial provides a simple walkthrough for users trying Alhazen for the first time. It is based on an analysis of the cryo-electron tomography (CryoET) literature and the tools we have developed to analyze those data.

Introduction to CryoET

Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.

This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute (CZII) on developing the CryoET Data Portal, an open-source repository for CryoET-based data.

Basics

Python Imports

Set Python imports, environment variables, and other crucial setup parameters here.

from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database, list_databases

from alhazen.utils.searchEngineUtils import *

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

import nltk
nltk.download('punkt')

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import json
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml
import pymde

# Connect to the local Postgres server and list the databases it currently hosts
engine = create_engine("postgresql+psycopg2://%s:%s@%s:5432/%s" % (
    os.environ['POSTGRES_USER'], os.environ['POSTGRES_PASSWORD'],
    os.environ['POSTGRES_HOST'], 'postgres'))
connection = engine.connect()
result = connection.execute(text("SELECT datname FROM pg_database;"))
dbn = [row[0] for row in result if row[0] != 'postgres']
connection.close()
dbn

Environment Variables

You must set the following environment variables for this code:

  • LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
  • POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST - the credentials and host of the Postgres server used by the literature database (see the connection code above).

Note that this notebook will build and use a database named cryoet, specified as db_name below.
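These variables must be set before the imports cell above runs (it connects to Postgres), for example by exporting them in your shell before launching the notebook. As a minimal sketch, the equivalent Python assignments would look like the following - the values are placeholders to replace with your own:

import os

# Placeholder values - substitute your own path and Postgres credentials
os.environ.setdefault('LOCAL_FILE_PATH', '/path/to/local/files')
os.environ.setdefault('POSTGRES_USER', 'postgres')
os.environ.setdefault('POSTGRES_PASSWORD', '<your-password>')
os.environ.setdefault('POSTGRES_HOST', 'localhost')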

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])    

loc = os.environ['LOCAL_FILE_PATH']
db_name = 'cryoet'

# Variable to prevent accidental deletion of the database or any records
OK_TO_DELETE = False

Set up utils, agents, and tools

This cell sets up the literature database interface (ldb) and lists the available large language models you can use.

ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())
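The keys printed above depend on which model back ends you have configured. If 'databricks_llama3' (used in the next cell) is not among them, a minimal sketch for falling back to the first available model - swap it in for the assignment below:

# Hypothetical fallback: use 'databricks_llama3' if configured, otherwise take
# the first model returned by lookup_chat_models()
llm = llms_lookup.get('databricks_llama3') or next(iter(llms_lookup.values()))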

This cell instantiates an AlhazenAgent that you can use to run tools or execute commands, and then lists the tools available to the agent.

# Pick one of the model keys printed above (here, 'databricks_llama3')
llm = llms_lookup.get('databricks_llama3')

cb = AlhazenAgent(llm, llm, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

Building the database

Scripts to Build / Delete the database

If you need to restore a deleted database from backup, use the following shell commands:

$ createdb cryoet
$ psql -d cryoet -f /local/file/path/cryoet/backup<date_time>.sql

This command will delete your existing database (but will also store a copy).

if OK_TO_DELETE:
    drop_ceifns_database(db_name, backupFirst=True)

This command will back up your current database.

if OK_TO_DELETE:
    current_date_time = datetime.now()
    formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
    backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
    backup_ceifns_database(db_name, backup_path)

This command will create a new, fresh, empty copy of your database.

create_ceifns_database(db_name)

# Sanity checks: confirm which Postgres host we are pointing at and
# list the databases now present on the server
os.environ['POSTGRES_HOST']
list_databases()

Build CEIFNS database from queries

Add a collection of all CryoET papers based on a query

This runs a query on Europe PMC for terms and synonyms related to cryo-electron tomography.

cryoet_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
    "Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" or 
    "cryo-electron tomography" OR "cryo-et" OR cryoet)
'''
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '1', 
                                       'name': 'CryoET Papers', 
                                       'query': cryoet_query})
# Count the papers (ScientificKnowledgeExpression records) now loaded in the database
l = []
q = ldb.session.query(SKE)
for ske in q.all():
    l.append(ske)
print(len(l))
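The query above counts every paper in the database. To count only the members of a single collection, a minimal sketch reusing the collection-membership join pattern used later in this notebook (assuming collection '1' is the CryoET Papers collection created above):

# Count only the papers that belong to collection '1' (CryoET Papers)
q1 = ldb.session.query(SKE) \
        .filter(SKC.id == SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKC.id == '1')
print(q1.count())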

Add a collection of machine-learning-related CryoET papers, also from a query

ml_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
    "Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" or 
    "cryo-electron tomography" OR "cryo-et" OR cryoet ) AND 
("Machine Learning" OR "Artificial Intelligence" OR "Deep Learning" OR "Neural Networks")
'''
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '2', 
                                       'name': 'Machine Learning in CryoET', 
                                       'query': ml_query, 
                                       'full_text': False})

Create a new collection of 20 randomly sampled papers to showcase the full-text download capability.

ldb.create_new_collection_from_sample('3', 'CryoET Papers Tests', '1', 20, ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])

Analyze Collections

Survey + Run Classifications over Papers

This invokes the following classification scheme on each paper (defined by the prompt in ./local_resources/prompts/tiab_prompts):

  • A - Structural descriptions of Viral Pathogens (such as HIV, Influenza, SARS-CoV-2, etc.)
  • B - Studies of mutated protein structures associated with disease (such as Alzheimer’s, Parkinson’s, etc.)
  • C - Structural studies of bacterial pathogens (such as E. coli, Salmonella, etc.)
  • D - Structural studies of plant cells
  • E - Structural studies of material science of non-biological samples
  • F - Structural studies of transporters or transport mechanisms within cells, studies involving the cytoskeleton or active transport processes.
  • G - Structural studies of synapses or other mechanisms of vesicle release across the plasma membrane
  • H - Structural studies of any other organelle or structured component of a cell.
  • I - Studies of dynamic biological processes at a cellular level (such as cell division, cell migration, etc.)
  • J - Studies of dynamics of molecular interactions within a cell.
  • K - Development of new CryoET imaging methods (including grid preparation techniques, such as lift-out).
  • L - Development of new data analysis methods (including machine learning, segmentation, point-picking, object recognition, or reconstruction).

# Find the title/abstract classification tool and run it over collection '3'
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '3', 'classification_type':'cryoet_study_types', 'repeat_run':True})
# USE WITH CAUTION - this will delete all title/abstract classification notes
# (type 'TiAbClassificationNote__cryoet_study_types') in the database
if OK_TO_DELETE:
    q = ldb.session.query(N, SKE) \
            .filter(N.id == NIA.Note_id) \
            .filter(NIA.is_about_id == SKE.id) \
            .filter(N.type == 'TiAbClassificationNote__cryoet_study_types')

    print(len(q.all()))
    for n, ske in q.all():
        ldb.delete_note(n.id)
    print(len(q.all()))

Run a query over the notes extracted and saved to the database to show the zero-shot document classifications based on titles + abstracts.

q = ldb.session.query(N, SKE) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
        .order_by(SKE.id)

output = []
for n, ske in q.all():
    tup = json.loads(n.content)
    tup['prov'] = n.name
    tup['doi'] = 'http://doi.org/' + re.sub('doi:', '', ske.id)
    tup['year'] = ske.publication_date.year
    tup['month'] = ske.publication_date.month
    tup['ref'] = ske.content
    output.append(tup)
df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df
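To get a quick survey of how the classifications are distributed, you can tabulate this dataframe. A minimal sketch, assuming the A-L study-type label is stored in one of the parsed note fields (the column name below is a guess - check df.columns for the actual field name):

# Hypothetical column name - adjust to whichever field in df holds the A-L code
label_col = 'classification_code'
if label_col in df.columns:
    print(df[label_col].value_counts())
    print(df.groupby(['year', label_col]).size())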

Run the Metadata Extraction Chain over listed papers

Here, we run the metadata extraction tool to examine its performance over the CryoET dataset.

Get full-text copies of all the papers in the sampled CryoET collection (id '3').

cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="3".'})

Identify which papers are in the sampled collection via their DOIs.

q = ldb.session.query(SKE.id) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKC.id=='3')
dois = [e.id for e in q.all()]
dois

Iterate over those DOIs and extract 15 metadata variables based on the questions listed in ./local_resources/prompt_elements/metadata_extraction.yaml.

# Get the metadata extraction tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d in dois:
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l)])

# Iterate over papers to run the metadata extraction tool
for d in dois:

    # Skip if the doi is already in the database
    if len(df)>0 and d in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test'})

    # Add the results to the dataframe
    l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l2)])

df
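If you want to keep a copy of the extracted metadata alongside the classification results saved earlier, a minimal sketch following the same file layout (the filename is arbitrary):

# Save the extracted metadata next to the classification results saved above
df.to_csv(loc+'/'+db_name+'/cryoet_metadata_extraction.tsv', sep='\t')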
# Create a zip archive of the full-text files downloaded for the sampled collection
ldb.create_zip_archive_of_full_text_files('3', loc+'/'+db_name+'/full_text_files.zip')
# Query all metadata extraction notes and flatten each one into a row of key/value pairs
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'MetadataExtractionNote')
l = []
for row in q3.all():
    paper = row[0]
    name = row[1]
#    provenance = json.loads(row[2])
    result = json.loads(row[3])
    kv = {k: result[k] for k in result}
    kv['DOI'] = paper
    kv['run'] = name
    l.append(kv)
# create a dataframe from the list of dictionaries with DOI as the index column
if len(l)>0:
    df = pd.DataFrame(l).set_index(['DOI', 'run'])
else: 
    df = pd.DataFrame()
df
# USE WITH CAUTION - this will delete every note attached to any paper that has
# a metadata extraction note (not just the extraction notes themselves)
if OK_TO_DELETE:
    for row in q3.all():
        d_id = row[0]
        e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
        notes_to_delete = []
        for n in ldb.read_notes_about_x(e):
            notes_to_delete.append(n.id)
        for n in notes_to_delete:
            ldb.delete_note(n)