from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database
from alhazen.utils.searchEngineUtils import *
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
import local_resources.queries.vcm_key_papers as vcm_key_papers
import json
Virtual Cell Landscaping Analysis
Using Alhazen to study and interrogate papers concerning the Virtual Cell data modeling work.
Note: this question is inherently driven by discussion and informal experience (as opposed to formal experimentation), so we would expect to …
Preliminaries
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)
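To show why these aliases pay off, here is a minimal sketch of a join-heavy query that counts member expressions per collection. It assumes an open SQLAlchemy Session named `session` (e.g., built with the sessionmaker imported above); the SKC_HM column names (collection_id, has_members_id) are illustrative assumptions, so check schema_sqla for the real attribute names.

# Sketch only: count member expressions per collection.
# Assumes `session` is an open Session over the CEIFNS database;
# the SKC_HM column names below are assumptions, not confirmed.
q = (session.query(SKC.name, func.count(SKE.id))
            .join(SKC_HM, SKC_HM.collection_id == SKC.id)
            .join(SKE, SKE.id == SKC_HM.has_members_id)
            .group_by(SKC.name)
            .order_by(desc(func.count(SKE.id))))
for name, n_members in q.all():
    print('%s\t%d' % (name, n_members))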
Remember to set environment variables for this code:

- ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
- LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
os.environ['ALHAZEN_DB_NAME'] = 'virtual_cell'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']
if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Database has been backed up to /users/gully.burns/alhazen/alhazen_workbooks/backup2024-02-14-09-25-50.sql
Database has been dropped successfully !!
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
100%|██████████| 310/310 [00:00<00:00, 2540.15it/s]
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = ChatOllama(model='mixtral:instruct')
llm2 = ChatOpenAI(model='gpt-4-1106-preview')
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)
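Note that get_langchain_chatmodel and MODEL_TYPE are imported above but unused here, since the chat models are constructed directly. A hypothetical equivalent via that helper is sketched below; the signature and enum member shown are assumptions, not verified against this version of alhazen.core.

# Assumption: get_langchain_chatmodel(model_type, model_name) returns a
# LangChain chat model; the enum member name below is illustrative only.
# llm = get_langchain_chatmodel(MODEL_TYPE.Ollama, 'mixtral:instruct')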
cb = AlhazenAgent(llm, llm)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
AGENT TOOLS
AddCollectionFromEPMCTool
AddAuthorsToCollectionTool
DescribeCollectionCompositionTool
DeleteCollectionTool
RetrieveFullTextTool
RetrieveFullTextToolForACollection
MetadataExtraction_EverythingEverywhere_Tool
SimpleExtractionWithRAGTool
PaperQAEmulationTool
ProcotolExtractionTool
CheckExpressionTool
TitleAbstractClassifier_OneDocAtATime_Tool
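With the agent and its toolkit in place, instructions can be issued as natural language. A minimal sketch, assuming the agent_executor attribute and the invoke() input format used in other Alhazen workbooks:

# Sketch: issue a natural-language instruction to the agent.
# `cb.agent_executor` and the input dict format are assumptions
# carried over from other Alhazen workbooks.
response = cb.agent_executor.invoke(
        {'input': 'What collections are currently available in the database?'})
print(response)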
with open(files(vcm_key_papers).joinpath('kp.json')) as f:
    kp = json.load(f)
for c in kp['Single-cell transformers']:
    print(c.split('\t')[-1].strip())
10.1101/2024.01.25.577152
10.1101/2023.11.29.569320
10.1038/s41586-023-06139-9
10.1101/2023.11.28.568918
10.1038/s41592-024-02201-0
10.1038/s41467-023-35923-4
10.1145/3583780.3615061
10.1016/j.isci.2023.106536
10.48550/arXiv.2302.03038
10.1101/2023.05.29.542705
10.1101/2023.03.24.534055
10.48550/arXiv.2306.04371
https://openreview.net/forum?id=KMtM5ZHxct
10.1101/2023.09.26.559542
10.1101/2023.10.03.560734
10.1101/2024.02.13.580114
https://openreview.net/forum?id=QFm186CbBp
10.1101/2023.07.04.547619
10.3390/biom13040611
10.1093/bioinformatics/btad165
10.1093/bib/bbad195
10.1101/2022.11.20.517285
10.48550/arXiv.2210.14330
10.3389/fgene.2022.1038919
10.1038/s42256-022-00534-z
10.1093/bib/bbab573
10.1101/2020.02.05.935239
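These identifiers can be used to seed a corpus directly. The sketch below builds a Europe PMC query from the DOIs and runs the AddCollectionFromEPMCTool by hand; the tool_input keys ('id', 'name', 'query', 'full_text') are assumptions drawn from how this tool is invoked in other Alhazen workbooks, and OpenReview URLs are skipped since Europe PMC cannot resolve them.

# Sketch: seed a collection from the DOIs printed above.
# The tool_input keys are assumptions based on other Alhazen workbooks.
dois = [c.split('\t')[-1].strip() for c in kp['Single-cell transformers']
        if 'openreview.net' not in c]
query = ' OR '.join(['doi:"%s"' % d for d in dois])
epmc_tool = [t for t in cb.tk.get_tools()
             if isinstance(t, AddCollectionFromEPMCTool)][0]
epmc_tool.run(tool_input={'id': '0',
                          'name': 'Single-cell transformers',
                          'query': query,
                          'full_text': False})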