Source code for czbenchmarks.datasets.dataset

from abc import ABC, abstractmethod
from io import StringIO
from pathlib import Path
from typing import Any, Optional

from .types import Organism


class Dataset(ABC):
    """
    Abstract base class for datasets.

    Each concrete Dataset subclass is responsible for extracting and managing
    the data required for a specific type of task from the provided input
    file. Subclasses should define instance variables to store these
    task-specific data items, which can then be accessed as object attributes
    or written to files for downstream use.

    All Dataset instances must specify an `Organism` enum value to indicate
    the organism from which the data was derived.

    Subclasses must implement:
        - `load_data`: Loads the dataset from the input file and populates
          relevant instance variables.
        - `store_task_inputs`: Stores the extracted task-specific inputs in
          files or directories as needed.
        - `_validate`: Validates dataset-specific constraints and
          requirements.

    Attributes:
        path: The path to the dataset file.
        task_inputs_dir: The directory where task-specific input files are
            stored.
        organism: The organism from which the data was derived.
        kwargs: The extra keyword arguments passed to the constructor; each
            one is also exposed as an instance attribute of the same name.
    """

    path: Path
    task_inputs_dir: Path
    organism: Organism

    def __init__(
        self,
        dataset_type_name: str,
        path: str | Path,
        organism: Organism,
        task_inputs_dir: Optional[Path] = None,
        **kwargs: Any,
    ):
        """
        Initialize a Dataset instance.

        Args:
            dataset_type_name (str): Name of the dataset type (used for
                directory naming).
            path (str | Path): Path to the dataset file.
            organism (Organism): Enum value indicating the organism.
            task_inputs_dir (Optional[Path]): Directory for storing
                task-specific inputs. When omitted, defaults to
                ``<path without suffix>_task_inputs/<dataset_type_name
                lowercased>``.
            kwargs (Any): Additional attributes for the dataset.

        Raises:
            ValueError: If the dataset path does not exist.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise ValueError("Dataset path does not exist")

        # Default location sits next to the dataset file, e.g.
        # "/data/foo.h5ad" -> "/data/foo_task_inputs/<type name>".
        self.task_inputs_dir = task_inputs_dir or (
            Path(f"{self.path.with_suffix('')}_task_inputs")
            / dataset_type_name.lower()
        )
        self.organism = organism

        # Keep the raw mapping and also expose every entry as an attribute.
        self.kwargs = kwargs
        for key, value in kwargs.items():
            setattr(self, key, value)

    @abstractmethod
    def load_data(self) -> None:
        """
        Load the dataset from its source file into memory.

        Subclasses must implement this method to load their specific data
        format. For example, SingleCellDataset loads an AnnData object from
        an h5ad file.

        The loaded data should be stored as instance attributes that can be
        accessed by other methods.
        """

    @abstractmethod
    def store_task_inputs(self) -> Path:
        """
        Store the task-specific inputs extracted from the dataset.

        Subclasses must implement this method to store task-specific files
        in a subdirectory of the dataset path. The subdirectory name is
        determined by the subclass.

        Returns:
            Path: The path to the directory storing the task input files.
        """
        pass

    def _store_task_input(self, path: Path | str, data: StringIO | str) -> None:
        """
        Store a single task input data stream to a file.

        Creates the necessary subdirectories if they do not exist.

        Args:
            path (Path | str): Relative path to the task input file,
                resolved against ``task_inputs_dir``.
            data (StringIO | str): Data to write to the file. A ``StringIO``
                is written in full, regardless of its current position.
        """
        output_file = self.task_inputs_dir / path
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Path.write_text only accepts ``str``; passing the StringIO object
        # itself raises TypeError, so unwrap the buffer's contents first.
        text = data if isinstance(data, str) else data.getvalue()
        output_file.write_text(text)

    @abstractmethod
    def _validate(self) -> None:
        """
        Perform dataset-specific validation.

        Subclasses must implement this method to validate dataset-specific
        constraints.
        """
        pass

    # FIXME VALIDATION: move to validation class?
    def validate(self) -> None:
        """
        Performs general validation checks, such as ensuring the organism is
        a valid `Organism` enum value. Calls `_validate` for subclass-specific
        validation.

        Raises:
            ValueError: If validation fails.
        """
        if not isinstance(self.organism, Organism):
            raise ValueError("Organism is not a valid Organism enum")

        self._validate()