African Microscopy

A key effort at CZI is supporting communities of scientists across the world.

Preliminaries

from alhazen.aliases import *
from alhazen.apps.chat import  AlhazenAgentChatBot
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool, MetadataExtractionWithRAGTool 
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

Remember to set the environment variables for this code:

  • ALHAZEN_DB_NAME - the name of the PostgreSQL database in which you are storing information
  • LOCAL_FILE_PATH - the location on disk where you save temporary files, downloaded models, or other data.
# Configure where this notebook stores its data. These assignments
# deliberately overwrite any values inherited from the shell, so the
# database operated on below is always the one named here.
os.environ['ALHAZEN_DB_NAME'] = 'african_microscopy'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

# Validate both settings *before* anything reads them, so that a missing
# variable produces the explanatory message rather than a bare KeyError.
# With the assignments above in place these guards cannot fire; they only
# matter if the assignments are removed and the values are expected to come
# from the surrounding environment instead.
if os.environ.get('ALHAZEN_DB_NAME') is None:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test. If the path
# exists but is not a directory, this raises instead of silently continuing.
os.makedirs(loc, exist_ok=True)
# Destructive step: remove any existing database with this name so the
# notebook starts from a clean slate. The captured output below reports a
# backup being written before the drop.
drop_ceifns_database(db_name)
Database has been backed up to /users/gully.burns/alhazen/em_tech/backup2024-02-12-13-24-55.sql
Database has been dropped successfully !!
# Recreate an empty database under the same name (the progress bar in the
# captured output below comes from this call).
create_ceifns_database(db_name)
100%|██████████| 311/311 [00:00<00:00, 4023.60it/s]
# Handle on the literature database, rooted at the local file path.
ldb = Ceifns_LiteratureDb(
    name=db_name,
    loc=loc,
)

# Chat model obtained through the alhazen helper, using the Ollama backend.
# NOTE(review): `llm` is not used by the cells visible here; presumably later
# cells consume it. Confirm before removing.
llm = get_langchain_chatmodel(
    llm_name='mixtral:instruction',
    model_type=MODEL_TYPE.Ollama,
)

# Agent chatbot; its toolkit (`cb.tk`) is inspected in the next cell.
cb = AlhazenAgentChatBot()

# List the class name of every tool registered in the chatbot's toolkit,
# one per line, indented with a tab under the heading.
print('AVAILABLE TOOLS')
for t in cb.tk.get_tools():
    print(f"\t{type(t).__name__}")
AVAILABLE TOOLS
    AddCollectionFromEPMCTool
    DescribeCollectionCompositionTool
    DeleteCollectionTool
    RetrieveFullTextTool
    RetrieveFullTextToolForACollection
    MetadataExtractionTool
    SimpleExtractionWithRAGTool
    PaperQAEmulationTool
    CheckExpressionTool
    IntrospectionTool