Computing on X using online (incremental) algorithms • cellxgene.census

This tutorial showcases computing per-gene and per-cell statistics for a user-defined query using out-of-core operations, where incremental (online) algorithms are needed.

NOTE: when query results are small enough to fit in memory, it’s usually easier to use get_seurat() or get_single_cell_experiment().

Contents

Incremental mean calculation
Counting cells grouped by dataset and gene

Incremental mean calculation

Many statistics, such as marginal means, are easy to calculate incrementally. Let’s begin with a query on the X$raw sparse matrix of unnormalized read counts, which will return results in shards incrementally accumulate the read count for each gene, then divide by the cell count to get the mean reads per cell for each gene.

First define a query - in this case a slice over the obs axis for cells with a specific tissue & sex value, and all genes on the var axis. The query$X() method returns an iterator of results, each as an Arrow Table. Each table will contain the sparse X data and obs/var coordinates, using standard SOMA names:

soma_data - the X values (float32)
soma_dim_0 - the obs coordinate (int64)
soma_dim_1 - the var coordinate (int64)

Important: the X matrices are joined to var/obs axis DataFrames by an integer join “id” (aka soma_joinid). They are NOT positionally indexed, and any given cell or gene may have a soma_joinid of any value (e.g., a large integer). In other words, for any given X value, the soma_dim_0 corresponds to the soma_joinid in the obs dataframe, and the soma_dim_1 coordinate corresponds to the soma_joinid in the var dataframe.

For convenience, the query class includes a utility to simplify operations on query slices. query$indexer is an indexer used to wrap the output of query$X(), converting from soma_joinids to positional indexing in the query results. Positions are [0, N), where N are the number of results on the query for any given axis.

Key points:

it is expensive to query and read the results - so rather than make multiple passes over the data, read it once and perform multiple computations.
by default, data in the census is indexed by soma_joinid and not positionally.

library("tiledbsoma")
library("cellxgene.census")
census <- open_soma()

query <- census$get("census_data")$get("mus_musculus")$axis_query(
  measurement_name = "RNA",
  obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain' && sex=='male'")
)

genes_df <- query$var(column_names = c("feature_id", "feature_name"))$concat()
genes_df <- as.data.frame(genes_df)
n_genes <- nrow(genes_df)

# accumulator vector (for each gene) for the total count over all cells in X("raw")
raw_sum_by_gene <- numeric(n_genes)
names(raw_sum_by_gene) <- genes_df$feature_id

# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
  table_part <- tables$read_next()
  # table_part is an Arrow table with the columns mentioned above. The result
  # order is not guaranteed!

  # table_part$soma_dim_1 is the var/gene soma_joinid. But note that these are
  # arbitrary int64 id's, and moreover each table_part may exhibit only a subset
  # of the values we'll see over all query results. query$indexer helps us map
  # any given soma_dim_1 values onto positions in query$var() (genes_df), that is
  # the union of all values we'll see.
  gene_indexes <- query$indexer$by_var(table_part$soma_dim_1)$as_vector()
  stopifnot(sum(gene_indexes >= n_genes) == 0)
  # sum(table_part) group by gene, yielding a numeric vector with the gene_index
  # in its names
  sum_part <- tapply(as.vector(table_part$soma_data), gene_indexes, sum)
  # update the accumulator vector
  which_genes <- as.integer(names(sum_part)) + 1 # nb: gene_indexes is zero-based
  stopifnot(sum(which_genes > n_genes) == 0)
  raw_sum_by_gene[which_genes] <- raw_sum_by_gene[which_genes] + sum_part
}

# Divide each sum by cell count to get mean reads per cell (for each gene),
# implicitly averaging in all zero entries in X even though they weren't included
# in the sparse query results.
genes_df$raw_mean <- raw_sum_by_gene / query$n_obs
genes_df
#>            feature_id  feature_name     raw_mean
#> 1  ENSMUSG00000021124         Vti1b 1.051981e+02
#> 2  ENSMUSG00000039377           Hlx 1.864338e+01
#> 3  ENSMUSG00000085604       Dhx58os 4.839911e-03
#> 4  ENSMUSG00000085125       Gm16070 4.343510e-03
#> 5  ENSMUSG00000029439        Sfswap 1.438792e+01
#> 6  ENSMUSG00000027792          Bche 8.145818e+00
#> 7  ENSMUSG00000090363        Gm3512 6.205014e-05
#> 8  ENSMUSG00000074896         Ifit3 6.227774e+01
#> 9  ENSMUSG00000030433          Sbk2 1.782907e-01
#> 10 ENSMUSG00000021972        Hmbox1 5.211322e+00
#> 11 ENSMUSG00000020752        Recql5 7.973649e+00
#> 12 ENSMUSG00000079555         Haus3 7.311037e+00
#> 13 ENSMUSG00000046178         Nxph1 2.097491e+01
#> 14 ENSMUSG00000022159         Rab2b 3.864993e+01
#> 15 ENSMUSG00000042371       Slc5a10 1.161589e+01
#> 16 ENSMUSG00000039270         Megf9 7.668363e+00
#> 17 ENSMUSG00000054641         Mmrn1 9.650865e-02
#> 18 ENSMUSG00000030443        Zfp583 2.867316e+00
#> 19 ENSMUSG00000038502         Ptov1 1.617562e+02
#> 20 ENSMUSG00000050671          Ism2 3.929842e-04
#> 21 ENSMUSG00000097170       Gm16982 1.232936e-01
#> 22 ENSMUSG00000034601 2700049A03Rik 7.181600e+00
#> 23 ENSMUSG00000040258         Nxph4 3.021842e-01
#> 24 ENSMUSG00000001014         Icam4 2.837966e-01
#> 25 ENSMUSG00000019761         Krt10 2.599611e+00
#> 26 ENSMUSG00000042198        Chchd7 2.078421e+01
#> 27 ENSMUSG00000037386         Rims2 1.047379e+01
#> 28 ENSMUSG00000086016 1700084C06Rik 1.447837e-04
#> 29 ENSMUSG00000098055       Gm26947 1.654670e-04
#> 30 ENSMUSG00000015165        Hnrnpl 2.534560e+01
#> 31 ENSMUSG00000056880         Gadl1 3.785265e-01
#> 32 ENSMUSG00000025175          Fn3k 1.647452e+01
#> 33 ENSMUSG00000028772       Zcchc17 5.578566e+01
#>  [ reached 'max' / getOption("max.print") -- omitted 52404 rows ]

Counting cells grouped by dataset and gene

The goal of this example is to count the number of cells with nonzero reads, grouped by gene and Census dataset_id. The result is a data frame with dataset, gene, and the number of cells with nonzero reads for that dataset and gene.

For this multi-factor aggregation, we’ll take advantage of dplyr routines instead of the lower-level vector indexer shown above. And for presentation purposes, we’ll limit the query to four genes, but this can be expanded to all genes easily.

library("dplyr")

query <- census$get("census_data")$get("mus_musculus")$axis_query(
  measurement_name = "RNA",
  obs_query = SOMAAxisQuery$new(value_filter = "tissue=='brain'"),
  var_query = SOMAAxisQuery$new(value_filter = "feature_name %in% c('Malat1', 'Ptprd', 'Dlg2', 'Pcdh9')")
)

obs_tbl <- query$obs(column_names = c("soma_joinid", "dataset_id"))$concat()
obs_df <- data.frame(
  # materialize soma_joinid as character to avoid overflowing R 32-bit integer
  cell_id = as.character(obs_tbl$soma_joinid),
  dataset_id = obs_tbl$dataset_id$as_vector()
)
var_tbl <- query$var(column_names = c("soma_joinid", "feature_name"))$concat()
var_df <- data.frame(
  gene_id = as.character(var_tbl$soma_joinid),
  feature_name = var_tbl$feature_name$as_vector()
)

# accumulator for # cells by dataset & gene
n_cells_grouped <- data.frame(
  "dataset_id" = character(0),
  "gene_id" = character(0),
  "n_cells" = numeric(0)
)

# iterate through in-memory shards of query results
tables <- query$X("raw")$tables()
while (!tables$read_complete()) {
  table_part <- tables$read_next()

  # prepare a (dataset,gene,1) tuple for each entry in table_part
  n_cells_part <- data.frame(
    "cell_id" = as.character(table_part$soma_dim_0),
    "gene_id" = as.character(table_part$soma_dim_1),
    "n_cells" = 1
  )
  n_cells_part <- left_join(n_cells_part, obs_df, by = "cell_id")
  stopifnot(sum(is.null(n_cells_part$dataset_id)) == 0)

  # fold those into n_cells_grouped
  n_cells_grouped <- n_cells_part %>%
    select(-cell_id) %>%
    bind_rows(n_cells_grouped) %>%
    group_by(dataset_id, gene_id) %>%
    summarise(n_cells = sum(n_cells)) %>%
    ungroup()
}

# add gene names for display
n_cells_grouped <- left_join(n_cells_grouped, var_df, by = "gene_id")
stopifnot(sum(is.null(n_cells_grouped$feature_name)) == 0)
n_cells_grouped[c("dataset_id", "feature_name", "n_cells")]
#> # A tibble: 21 x 3
#>    dataset_id                           feature_name n_cells
#>    <chr>                                <chr>          <dbl>
#>  1 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Ptprd          79578
#>  2 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Dlg2           79513
#>  3 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Malat1         79667
#>  4 3bbb6cf9-72b9-41be-b568-656de6eb18b5 Pcdh9          79476
#>  5 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Ptprd            474
#>  6 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Dlg2              81
#>  7 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Malat1         12622
#>  8 58b01044-c5e5-4b0f-8a2d-6ebf951e01ff Pcdh9            125
#>  9 66ff82b4-9380-469c-bc4b-cfa08eacd325 Dlg2             856
#> 10 66ff82b4-9380-469c-bc4b-cfa08eacd325 Malat1          7102
#> # i 11 more rows

Don’t forget to close the census.

census$close()