from alhazen.apps.chat import AlhazenAgentChatBot
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import *
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
from alhazen.utils.searchEngineUtils import load_paper_from_openalex, read_references_from_openalex
from pyalex import config, Works, Work
= "gully.burns@chanzuckerberg.com"
config.email
import requests
import os
CellXGene Datasets
Collaboration work to enable curation of data into the CellXGene system
The goal of this work is to survey the literature in RNAseq analysis as it happens. We intend to use this approach to contact researchers publishing these valuable datasets so that we can include them in the CellXGene data portal.
Preliminaries
# Using Aliases like this massively simplifies the use of SQLAlchemy.
# Each alias gives a short, query-friendly handle for one ORM model from
# alhazen.schema_sqla (imported via `*` at the top of the file).
IR = aliased(InformationResource)
SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)
N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)
Remember to set environmental variables for this code:
ALHAZEN_DB_NAME
- the name of the PostgreSQL database you are storing information into
LOCAL_FILE_PATH
- the location on disk where you save temporary files, downloaded models or other data.
# Configure the environment for this workbook: which database to use and
# where to keep local working files.
os.environ['ALHAZEN_DB_NAME'] = 'sc_sequencing'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

# Make sure the local working directory exists (exist_ok avoids a race
# between the existence check and the mkdir call).
os.makedirs(os.environ['LOCAL_FILE_PATH'], exist_ok=True)

# Fail fast if either required variable is missing.
if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

# Remove any previous copy of the database (the helper backs it up first —
# see the "Database has been backed up" message in the cell output below).
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Database has been backed up to /users/gully.burns/alhazen/alhazen_workbooks/backup2024-02-14-09-25-50.sql
Database has been dropped successfully !!
# Create a fresh CEIFNS database, then build the three main workbook
# objects: the literature database wrapper, the LLM, and the chat agent.
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
# (cell output: 100%|██████████| 310/310 [00:00<00:00, 2594.82it/s])

ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = get_langchain_chatmodel(model_type=MODEL_TYPE.Ollama, llm_name='mixtral:instruction')
cb = AlhazenAgentChatBot()

# List the tools the agent's toolkit exposes, one per line.
print('AVAILABLE TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
AVAILABLE TOOLS
AddCollectionFromEPMCTool
AddAuthorsToCollectionTool
DescribeCollectionCompositionTool
DeleteCollectionTool
RetrieveFullTextTool
RetrieveFullTextToolForACollection
MetadataExtraction_EverythingEverywhere_Tool
SimpleExtractionWithRAGTool
PaperQAEmulationTool
ProcotolExtractionTool
CheckExpressionTool
IntrospectionTool
# Define the API endpoint URL
# Fetch one paper from OpenAlex by DOI, then walk its reference list and
# load each cited work as well.
doi = '10.7150/ijbs.82191'
e = load_paper_from_openalex(doi)

# NOTE(review): the reference lookup uses the 'doi:' prefixed form of the
# same identifier — presumably required by read_references_from_openalex;
# confirm against that helper's signature.
doi = 'doi:10.7150/ijbs.82191'
referenced_works = read_references_from_openalex(doi)
for r in referenced_works:
    e = load_paper_from_openalex(r)