CellXGene Datasets

Collaboration work to enable curation of data into the CellXGene system

The goal of this work is to survey the RNA-seq analysis literature as it is published. We intend to use this approach to identify and contact the researchers publishing these valuable datasets so that the datasets can be included in the CellXGene data portal.

Preliminaries

from alhazen.apps.chat import  AlhazenAgentChatBot
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

from alhazen.utils.searchEngineUtils import load_paper_from_openalex, read_references_from_openalex 
from pyalex import config, Works, Work
# Register a contact e-mail with pyalex for its OpenAlex requests.
# NOTE(review): per the pyalex/OpenAlex docs this opts requests into the
# "polite pool"; the address is hard-coded, so change it when running this
# notebook under a different identity.
config.email = "gully.burns@chanzuckerberg.com"

import requests
import os

# Using aliases like this massively simplifies the use of SQLAlchemy.
# Each short name below is the result of `sqlalchemy.orm.aliased()` applied to
# one schema class (presumably provided by the star import of
# `alhazen.schema_sqla` -- TODO confirm), so queries can refer to the class by
# a compact identifier.
IR = aliased(InformationResource)

# Collection / Expression / Item / Fragment classes and their association
# classes, abbreviated from the initials of the wrapped class name.
SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

# Note classes and the per-level "HasNotes" association classes.
N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)

Remember to set the following environment variables for this code:

ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models or other data.

# Configure this notebook's database name and local working directory.
# These assignments deliberately override anything already present in the
# process environment, exactly as before.
os.environ['ALHAZEN_DB_NAME'] = 'sc_sequencing'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

# Validate both settings BEFORE they are used. Previously LOCAL_FILE_PATH was
# dereferenced by the directory check ahead of its own "is it set?" test, so a
# missing value could only ever surface as a bare KeyError. Empty strings are
# rejected as well, since neither an empty database name nor an empty path is
# usable.
db_name = os.environ.get('ALHAZEN_DB_NAME')
if not db_name:
    raise Exception('Which database do you want to use for this application?')

loc = os.environ.get('LOCAL_FILE_PATH')
if not loc:
    raise Exception('Where are you storing your local literature database?')

# Create the working directory if needed; exist_ok avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(loc, exist_ok=True)

# WARNING: destructive -- drops the database named above so the notebook starts
# from a clean slate.
drop_ceifns_database(db_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Database has been backed up to /users/gully.burns/alhazen/alhazen_workbooks/backup2024-02-14-09-25-50.sql
Database has been dropped successfully !!

# Recreate the database; `db_name` was read from ALHAZEN_DB_NAME earlier in
# this notebook and holds the same value as the environment variable.
create_ceifns_database(db_name)

100%|██████████| 310/310 [00:00<00:00, 2594.82it/s]

# Build the three top-level handles used by this notebook, in the original
# order: the literature database wrapper, the chat model, and the agent chatbot.
ldb = Ceifns_LiteratureDb(
    loc=loc,
    name=db_name,
)
llm = get_langchain_chatmodel(
    model_type=MODEL_TYPE.Ollama,
    llm_name='mixtral:instruction',
)
cb = AlhazenAgentChatBot()

# List the class name of every tool exposed by the chatbot's toolkit.
print('AVAILABLE TOOLS')
for t in cb.tk.get_tools():
    tool_class_name = type(t).__name__
    print(f'\t{tool_class_name}')

AVAILABLE TOOLS
    AddCollectionFromEPMCTool
    AddAuthorsToCollectionTool
    DescribeCollectionCompositionTool
    DeleteCollectionTool
    RetrieveFullTextTool
    RetrieveFullTextToolForACollection
    MetadataExtraction_EverythingEverywhere_Tool
    SimpleExtractionWithRAGTool
    PaperQAEmulationTool
    ProcotolExtractionTool
    CheckExpressionTool
    IntrospectionTool

# DOI of the seed paper, in bare form (no scheme and no `doi:` prefix).
doi = '10.7150/ijbs.82191'

# Presumably loads the seed paper itself via OpenAlex -- confirm against
# alhazen.utils.searchEngineUtils.
e = load_paper_from_openalex(doi)

# NOTE(review): the same DOI is re-bound here with a `doi:` prefix before being
# passed to read_references_from_openalex; presumably the two helpers expect
# different identifier formats -- verify.
doi = 'doi:10.7150/ijbs.82191'
referenced_works = read_references_from_openalex(doi)
# Pass every returned reference back through load_paper_from_openalex. `e` is
# overwritten on each iteration, so only the last result stays bound afterwards.
for r in referenced_works:
    e = load_paper_from_openalex(r)