from alhazen.apps.chat import AlhazenAgentChatBot
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool, MetadataExtractionWithRAGTool
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml
Pathogen_Landscaping
Building a database indexing the literature across all bacteria, viruses, fungi, and the diseases that they cause.
Note - this question is inherently driven by discussion and informal experience (as opposed to formal experimentation). So we would expect to
Preliminaries
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
NIA = aliased(NoteIsAbout)

SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)
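To show how these aliases get used downstream, here is a minimal sketch (not from the original notebook) of querying the database directly with a plain SQLAlchemy session. The connection URL is an assumption about a local PostgreSQL instance; the Ceifns_LiteratureDb object created below normally manages the connection and session for you.

engine = create_engine('postgresql+psycopg2://localhost/alhazen_db')  # hypothetical URL, adjust to your setup
Session = sessionmaker(bind=engine)
with Session() as session:
    # Count papers (expressions) and their loaded representations (items)
    n_expressions = session.query(func.count()).select_from(SKE).scalar()
    n_items = session.query(func.count()).select_from(SKI).scalar()
    print('expressions: %d, items: %d' % (n_expressions, n_items))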
Remember to set environment variables for this code:

- ALHAZEN_DB_NAME - the name of the PostgreSQL database you are storing information into
- LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
os.environ['ALHAZEN_DB_NAME'] = 'machine_learning_and_biology'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Database has been backed up to /users/gully.burns/alhazen/alhazen_workbooks/backup2024-02-14-09-25-50.sql
Database has been dropped successfully !!
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])
100%|██████████| 311/311 [00:00<00:00, 2249.83it/s]
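As a quick sanity check (not part of the original notebook), SQLAlchemy's inspector can list the tables that create_ceifns_database just set up. The connection URL below is an assumption about a local PostgreSQL server.

from sqlalchemy import inspect
engine = create_engine('postgresql+psycopg2://localhost/' + os.environ['ALHAZEN_DB_NAME'])  # assumed local server
print(inspect(engine).get_table_names())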
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = get_langchain_chatmodel(model_type=MODEL_TYPE.Ollama, llm_name='mixtral:instruction')
cb = AlhazenAgentChatBot()

print('AVAILABLE TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)
AVAILABLE TOOLS
AddCollectionFromEPMCTool
AddAuthorsToCollectionTool
DescribeCollectionCompositionTool
DeleteCollectionTool
RetrieveFullTextTool
RetrieveFullTextToolForACollection
MetadataExtractionTool
SimpleExtractionWithRAGTool
PaperQAEmulationTool
CheckExpressionTool
IntrospectionTool
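To go from this list to actually working with a tool, one option is to look a tool up by its class name and inspect it before calling it. This is a hedged sketch, assuming the Alhazen tools expose the standard LangChain BaseTool attributes (name, description, args); consult each tool's args schema for the inputs it actually expects.

# Index the toolkit's tools by class name and inspect one of them
tools_by_class = {type(t).__name__: t for t in cb.tk.get_tools()}
epmc_tool = tools_by_class['AddCollectionFromEPMCTool']
print(epmc_tool.name)
print(epmc_tool.description)
print(epmc_tool.args)   # the input schema the tool expects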