from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
import nltk
nltk.download('punkt')
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import json
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml
CryoET Tutorial
Introduction to CryoET
Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.
This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute (CZII) in developing the CryoET Data Portal, an open-source repository for CryoET-based data.
Basics
Python Imports
Set up Python imports, environment variables, and other essential configuration parameters here.
import pymde

# Connect to the local PostgreSQL server and list the available databases
engine = create_engine("postgresql+psycopg2://%s:%s@%s:5432/%s"%(os.environ['POSTGRES_USER'], os.environ['POSTGRES_PASSWORD'], os.environ['POSTGRES_HOST'], 'postgres'))
connection = engine.connect()
result = connection.execute(text("SELECT datname FROM pg_database;"))
dbn = [row[0] for row in result if row[0] != 'postgres']
connection.close()
dbn
Environment Variables
You must set the following environment variables for this code:

- LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.

Note that this notebook will build and use a database named by the db_name variable (set to cryoet below).
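For example, you could set these variables at the top of the notebook before running the cells below. The values here are placeholders only; note that the database-listing cell above also reads POSTGRES_USER, POSTGRES_PASSWORD, and POSTGRES_HOST.

# Placeholder values - substitute paths and credentials for your own environment
os.environ['LOCAL_FILE_PATH'] = '/path/to/local/literature/data/'
os.environ['POSTGRES_USER'] = 'postgres'
os.environ['POSTGRES_PASSWORD'] = '<your-password>'
os.environ['POSTGRES_HOST'] = 'localhost'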
if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
loc = os.environ['LOCAL_FILE_PATH']
db_name = 'cryoet'

# Variable to prevent accidental deletion of the database or any records
OK_TO_DELETE = False
Setup utils, agents, and tools
This cell sets up a database interface (ldb) and lists the available large language models you can use.
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())
This cell initializes an AlhazenAgent that you can use to run tools or execute commands.
llm = llms_lookup.get('databricks_llama3')

cb = AlhazenAgent(llm, llm, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
Building the database
Scripts to Build / Delete the database
If you need to restore a deleted database from backup, use the following shell commands:
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
This command will delete your existing database (but will also store a copy).
if OK_TO_DELETE:
    drop_ceifns_database(db_name, backupFirst=True)
This command will back up your current database.
if OK_TO_DELETE:
    current_date_time = datetime.now()
    formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
    backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
    backup_ceifns_database(db_name, backup_path)
This command will create a new, fresh, empty copy of your database.
create_ceifns_database(db_name)
os.environ['POSTGRES_HOST']
list_databases()
Build CEIFNS database from queries
Add a collection of all CryoET papers based on a query
This runs a query on European PMC for terms + synonyms related to Cryo Electron Tomography
cryoet_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
"Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" OR
"cryo-electron tomography" OR "cryo-et" OR cryoet)
'''

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '1',
                                       'name': 'CryoET Papers',
                                       'query': cryoet_query})
# Count the papers currently loaded in the database
l = []
q = ldb.session.query(SKE)
for ske in q.all():
    l.append(ske)
print(len(l))
Add a collection of machine learning papers in CryoET, also based on a query
ml_query = '''
("Cryoelectron Tomography" OR "Cryo Electron Tomography" OR "Cryo-Electron Tomography" OR
"Cryo-ET" OR "CryoET" OR "Cryoelectron Tomography" OR "cryo electron tomography" OR
"cryo-electron tomography" OR "cryo-et" OR cryoet) AND
("Machine Learning" OR "Artificial Intelligence" OR "Deep Learning" OR "Neural Networks")
'''

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '2',
                                       'name': 'Machine Learning in CryoET',
                                       'query': ml_query,
                                       'full_text': False})
Create a new collection of randomly sampled papers to showcase the full-text download capability.
ldb.create_new_collection_from_sample('3', 'CryoET Papers Tests', '1', 20, ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])
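To check how many papers ended up in the new sample collection, you can reuse the same collection-membership join that appears later in this notebook (here with collection id '3'):

# Count the papers sampled into collection '3'
q_check = ldb.session.query(SKE.id) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id==SKE.id) \
    .filter(SKC.id=='3')
print(len(q_check.all()))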
Analyze Collections
Survey + Run Classifications over Papers
This invokes the following classification process on each paper (defined by the prompt definitions in ./local_resources/prompts/tiab_prompts):
- A - Structural descriptions of Viral Pathogens (such as HIV, Influenza, SARS-CoV-2, etc.)
- B - Studies of mutated protein structures associated with disease (such as Alzheimer’s, Parkinson’s, etc.)
- C - Structural studies of bacterial pathogens (such as E. coli, Salmonella, etc.)
- D - Structural studies of plant cells
- E - Structural studies of material science of non-biological samples
- F - Structural studies of transporters or transport mechanisms within cells, studies involving the cytoskeleton or active transport processes.
- G - Structural studies of synapses or other mechanisms of releasing vesicles across the plasma membrane
- H - Structural studies of any other organelle or structured component of a cell.
- I - Studies of dynamic biological processes at a cellular level (such as cell division, cell migration, etc.)
- J - Studies of dynamics of molecular interactions within a cell.
- K - Development of new CryoET imaging methods (including grid preparation techniques, such as lift-out).
- L - Development of new data analysis methods (including machine learning, segmentation, point-picking, object recognition, or reconstruction).
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '3', 'classification_type': 'cryoet_study_types', 'repeat_run': True})
# USE WITH CAUTION - this will delete all title/abstract classification notes in the database
if OK_TO_DELETE:
    q = ldb.session.query(N, SKE) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types')
    print(len(q.all()))
    for n, ske in q.all():
        ldb.delete_note(n.id)
    print(len(q.all()))
Run a query over the notes extracted and saved to the database to show the zero-shot document classifications based on titles and abstracts.
q = ldb.session.query(N, SKE) \
    .filter(N.id == NIA.Note_id) \
    .filter(NIA.is_about_id == SKE.id) \
    .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
    .order_by(SKE.id)
output = []
for n, ske in q.all():
    tup = json.loads(n.content)
    tup['prov'] = n.name
    tup['doi'] = 'http://doi.org/'+re.sub('doi:', '', ske.id)
    tup['year'] = ske.publication_date.year
    tup['month'] = ske.publication_date.month
    tup['ref'] = ske.content
    output.append(tup)
df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df
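If you want a quick summary of how the papers distribute across the study-type categories, you can tally the labels directly from this dataframe. The column holding the category code depends on the classification prompt's output format, so 'classification_code' below is an assumed name; inspect df.columns to find the actual field for your run.

# Tally study-type labels (column name is an assumption - inspect df.columns first)
print(df.columns.tolist())
if 'classification_code' in df.columns:
    print(df['classification_code'].value_counts())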
Run MetaData Extraction Chain over listed papers
Here, we run various versions of the metadata extraction tool to examine performance over the cryoet dataset.
Get full text copies of all the papers about CryoET
cb.agent_executor.invoke({'input': 'Get full text copies of all papers in the collection with id="3".'})
Identify which papers are in the collection of interest by listing their DOIs.
q = ldb.session.query(SKE.id) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id==SKE.id) \
    .filter(SKC.id=='2')
dois = [e.id for e in q.all()]
dois
Iterate over those DOIs and extract 15 metadata variables based on the questions defined in ./local_resources/prompt_elements/metadata_extraction.yaml.
# Get the metadata extraction tool
t2 = [t for t in cb.tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d in dois:
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l)])

# Iterate over papers to run the metadata extraction tool
for d in dois:

    # Skip if the doi is already in the database
    if len(df)>0 and d in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test'})

    # Add the results to the dataframe
    l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l2)])

df
# Create a zip archive of the full-text files in collection '2'
ldb.create_zip_archive_of_full_text_files('2', loc+'/'+db_name+'/full_text_files.zip')
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
    .filter(N.id == NIA.Note_id) \
    .filter(NIA.is_about_id == SKE.id) \
    .filter(N.type == 'MetadataExtractionNote')
l = []
for row in q3.all():
    paper = row[0]
    name = row[1]
    # provenance = json.loads(row[2])
    result = json.loads(row[3])
    kv = {k: result[k] for k in result}
    kv['DOI'] = paper
    kv['run'] = name
    l.append(kv)

# create a dataframe from the list of dictionaries with DOI as the index column
if len(l) > 0:
    df = pd.DataFrame(l).set_index(['DOI', 'run'])
else:
    df = pd.DataFrame()
df
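If you want to keep a copy of this metadata table alongside the classification results exported earlier, you can write it out the same way; the filename below is just a suggestion.

# Optional: save the extracted metadata table as a TSV (filename is arbitrary)
if len(df) > 0:
    df.to_csv(loc+'/'+db_name+'/metadata_extraction.tsv', sep='\t')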
# USE WITH CAUTION - this will delete all notes attached to papers that have
# metadata extraction notes in the database
if OK_TO_DELETE:
    for row in q3.all():
        d_id = row[0]
        e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
        notes_to_delete = []
        for n in ldb.read_notes_about_x(e):
            notes_to_delete.append(n.id)
        for n in notes_to_delete:
            ldb.delete_note(n)