Source code for czbenchmarks.datasets.types

from dataclasses import dataclass
from enum import Enum
from typing import Type, Union, Dict
import numpy as np
import pandas as pd
import anndata as ad
from omegaconf import OmegaConf


[docs] class Organism(Enum): HUMAN = ("homo_sapiens", "ENSG") MOUSE = ("mus_musculus", "ENSMUSG") TROPICAL_CLAWED_FROG = ("xenopus_tropicalis", "ENSXETG") AFRICAN_CLAWED_FROG = ("xenopus_laevis", "ENSXLAG") ZEBRAFISH = ("danio_rerio", "ENSDARG") MOUSE_LEMUR = ("microcebus_murinus", "ENSMICG") WILD_BOAR = ("sus_scrofa", "ENSSSCG") CRAB_EATING_MACAQUE = ("macaca_fascicularis", "ENSMFAG") RHESUS_MACAQUE = ("macaca_mulatta", "ENSMMUG") PLATYPUS = ("ornithorhynchus_anatinus", "ENSOANG") OPOSSUM = ("monodelphis_domestica", "ENSMODG") GORILLA = ("gorilla_gorilla", "ENSGGOG") CHIMPANZEE = ("pan_troglodytes", "ENSPTRG") MARMOSET = ("callithrix_jacchus", "ENSCJAG") CHICKEN = ("gallus_gallus", "ENSGALG") RABBIT = ("oryctolagus_cuniculus", "ENSOCUG") FRUIT_FLY = ("drosophila_melanogaster", "FBgn") RAT = ("rattus_norvegicus", "ENSRNOG") NAKED_MOLE_RAT = ("heterocephalus_glaber", "ENSHGLG") CAENORHABDITIS_ELEGANS = ("caenorhabditis_elegans", "WBGene") YEAST = ("saccharomyces_cerevisiae", "") MALARIA_PARASITE = ("plasmodium_falciparum", "PF3D7") SEA_LAMPREY = ("petromyzon_marinus", "ENSPMAG") FRESHWATER_SPONGE = ("spongilla_lacustris", "ENSLPGG") CORAL = ("stylophora_pistillata", "LOC") SEA_URCHIN = ("lytechinus_variegatus", "") # Mixed prefixes: LOC and GeneID # Todo: add other organisms def __init__(self, name: str, prefix: str): self._value_ = (name, prefix) # This is handled automatically by Enum
[docs] def __str__(self): return self.value[0] # Access the name from the tuple
[docs] def __repr__(self): return self.value[0] # Access the name from the tuple
@property def name(self): return self.value[0] @property def prefix(self): return self.value[1]
# Register Organism resolver if not OmegaConf.has_resolver("organism"): # Required for dataset test cases OmegaConf.register_new_resolver("organism", lambda name: getattr(Organism, name))
[docs] @dataclass(frozen=True) class DataTypeSpec: """Specification for a data type in the system""" name: str dtype: Type description: str is_input: bool = True
[docs] class DataType(Enum): # Input types METADATA = DataTypeSpec( name="metadata", dtype=pd.DataFrame, description="Sample metadata" ) ANNDATA = DataTypeSpec( name="anndata", dtype=ad.AnnData, description="AnnData object containing expression data", ) ORGANISM = DataTypeSpec( name="organism", dtype=Organism, description="Organism type (e.g. human, mouse)", ) # Output types EMBEDDING = DataTypeSpec( name="embedding", dtype=np.ndarray, description="Learned cell embeddings", is_input=False, ) CONDITION_KEY = DataTypeSpec( name="condition_key", dtype=str, description="Condition key for perturbation data", is_input=True, ) SPLIT_KEY = DataTypeSpec( name="split_key", dtype=str, description="Train, test, val, split key for perturbation data", is_input=True, ) PERTURBATION_PRED = DataTypeSpec( name="perturbation", dtype=tuple[str, pd.DataFrame], description="Tuple of (condition_str, Predicted perturbation effects)", is_input=False, ) PERTURBATION_TRUTH = DataTypeSpec( name="perturbation_truth", dtype=Dict[str, pd.DataFrame], description="Truth perturbation data", is_input=True, ) @property def spec(self) -> DataTypeSpec: return self.value @property def dtype(self) -> Type: return self.value.dtype @property def description(self) -> str: return self.value.description @property def is_input(self) -> bool: return self.value.is_input @property def is_output(self) -> bool: return not self.value.is_input
[docs] def __hash__(self): return hash(self.name)
[docs] def __eq__(self, other): if isinstance(other, str): return self.name == other return super().__eq__(other)
[docs] def __str__(self): return self.name
[docs] def __repr__(self): return self.name
DataValue = Union[pd.DataFrame, ad.AnnData, np.ndarray, Organism]