Skip to contents

This tutorial demonstrates basic use of the census_datasets data frame, which contains metadata about the Census source datasets. This metadata can be joined to the cell metadata data frame (obs) via the dataset_id column.

Contents

  1. Fetching the datasets table.
  2. Fetching the expression data from a single dataset.
  3. Downloading the original source H5AD file of a dataset.
  4. Closing the census.

Fetching the datasets table

Each Census contains a top-level data frame itemizing the datasets contained therein. You can read this SOMADataFrame into an Arrow Table:

library("cellxgene.census")

# Open the Census, then walk down to the "datasets" data frame inside the
# "census_info" collection and read it in full as a single Arrow Table.
census <- open_soma()
census_datasets <- census$
  get("census_info")$
  get("datasets")$
  read()$
  concat()
print(census_datasets)
#> Table
#> 651 rows x 9 columns
#> $soma_joinid <int64 not null>
#> $collection_id <large_string not null>
#> $collection_name <large_string not null>
#> $collection_doi <large_string not null>
#> $dataset_id <large_string not null>
#> $dataset_version_id <large_string not null>
#> $dataset_title <large_string not null>
#> $dataset_h5ad_path <large_string not null>
#> $dataset_total_cell_count <int64 not null>

and then convert it to an R data frame:

# Convert the Arrow Table into a base R data frame, then show only the
# identifying columns and the per-dataset cell count.
census_datasets <- as.data.frame(census_datasets)
print(
  census_datasets[c("dataset_id", "dataset_title", "dataset_total_cell_count")]
)
#>                              dataset_id
#> 1  2bdd3a2c-2ff4-4314-adf3-8a06b797a33a
#> 2  f5b0810c-1664-4a62-ad06-be1d9964aa8b
#> 3  e4ddac12-f48f-4455-8e8d-c2a48a683437
#> 4  e2808a6e-e2ea-41b9-b38c-4a08f1677f02
#> 5  d01c9dff-abd1-4825-bf30-2eb2ba74597e
#> 6  c3aa4f95-7a18-4a7d-8dd8-ca324d714363
#> 7  be401db3-d732-408a-b0c4-71af0458b8ab
#> 8  a5d5c529-8a1f-40b5-bda3-35208970070d
#> 9  9c63201d-bfd9-41a8-bbbc-18d947556f3d
#> 10 93cb76aa-a84b-4a92-8e6c-66a914e26d4c
#> 11 8d1dd010-5cbc-43fb-83f8-e0de8e8517da
#> 12 716a4acc-919e-4326-9672-ebe06ede84e6
#> 13 5bdc423a-59e6-457d-aa01-debd2c9c564f
#> 14 5346f9c6-755e-4336-94cc-38706ec00c2f
#> 15 015c230d-650c-4527-870d-8a805849a382
#> 16 d567b692-c374-4628-a508-8008f6778f22
#> 17 cf83c98a-3791-4537-bbde-a719f6d73c13
#> 18 738942eb-ac72-44ff-a64b-8943b5ecd8d9
#> 19 f8d8b443-bca6-4c3c-9042-669dfb7f8030
#> 20 f5be4b96-f5a3-4c3d-84ac-6f69daf744d5
#> 21 dea1aa78-c0a2-413f-b375-f91cce49e4d0
#> 22 92161459-9103-4379-ae34-73a38eee1d1d
#> 23 5829c7ba-697f-418e-8b98-d605b192dc48
#> 24 4dd1cd23-fc4d-4fd1-9709-602540f3ca6f
#> 25 2856d06c-0ff9-4e01-bfc9-202b74d0b60f
#> 26 251b1a7e-d050-4486-8d50-4c2619eb0f46
#> 27 07760522-707a-4a1c-8891-dbd1226d6b27
#> 28 9fcb0b73-c734-40a5-be9c-ace7eea401c9
#> 29 1a38e762-2465-418f-b81c-6a4bce261c34
#> 30 f16a8f4d-bc97-43c5-a2f6-bbda952e4c5c
#> 31 94c41723-b2c4-4b59-a49a-64c9b851903e
#> 32 6ceeaa86-9ceb-4582-b390-6d4dd6ff0572
#> 33 9a64bf99-ebe5-4276-93a8-bee9dff1cd47
#> 34 fc0ceb80-d2d9-47c1-9d78-b0e45c64c500
#> 35 d0ea3ec4-0f3b-4649-9146-1c0b5f303a55
#> 36 b8920ef5-7d22-497b-abca-a7a9eb76d79a
#> 37 b1d37bbd-9ae4-4404-b2f9-f2fe66750e4e
#> 38 a4e89c26-e8d4-4471-9b06-16a1405880f0
#> 39 a190b2e9-3796-4785-9a2f-013e2a9a43e6
#> 40 9ff9f9ba-016b-4cbb-8899-45dc20860b8b
#> 41 9940f951-3dc0-4579-bbb2-2392786e59a3
#> 42 74d584f0-74fc-482e-b944-e76f29c1ab85
#> 43 6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
#> 44 6cda07c7-5d7a-41ba-9799-5bb73da25a60
#> 45 646e3e87-e46b-4b12-85b5-8d8589e26088
#> 46 6437bc9c-16cb-46c8-8f79-9a7384a0212a
#> 47 58c43cc2-e00e-43c4-94eb-8501369264e1
#> 48 53bc5729-6202-4351-bc99-1f36139e9dc4
#> 49 44c83972-e5d2-4858-ac58-2df9f4bf564b
#> 50 2ecc72f8-085f-4e86-8692-771f316c54f6
#> 51 2e5a9b5d-d31b-4e9f-a179-d5d70ba459fb
#> 52 1c9f5c6b-73da-4d17-95de-df080ffe0df1
#> 53 100c6145-7b0e-4ba6-81c1-ffebed0d1ac4
#> 54 0ed60482-a34f-4268-b576-d69cc30210f6
#> 55 0eccaf0c-19d2-4900-9962-899378adf8be
#> 56 04c94a7d-1133-42c9-bb48-c697bd302a8d
#> 57 0374f03c-62e2-4859-8a14-acb00b0627d5
#> 58 03181d87-4769-41e7-8c39-d9a81835f0d2
#> 59 f171db61-e57e-4535-a06a-35d8b6ef8f2b
#> 60 ecf2e08e-2032-4a9e-b466-b65b395f4a02
#> 61 74cff64f-9da9-4b2a-9b3b-8a04a1598040
#> 62 5af90777-6760-4003-9dba-8f945fec6fdf
#> 63 bd65a70f-b274-4133-b9dd-0d1431b6af34
#> 64 ff45e623-7f5f-46e3-b47d-56be0341f66b
#> 65 f01bdd17-4902-40f5-86e3-240d66dd2587
#> 66 e6a11140-2545-46bc-929e-da243eed2cae
#> 67 e5c63d94-593c-4338-a489-e1048599e751
#> 68 d8732da6-8d1d-42d9-b625-f2416c30054b
#> 69 d77ec7d6-ef2e-49d6-9e79-05b7f8881484
#> 70 cee11228-9f0b-4e57-afe2-cfe15ee56312
#> 71 a357414d-2042-4eb5-95f0-c58604a18bdd
#> 72 a2d4d33e-4c62-4361-b80a-9be53d2e50e8
#> 73 a0754256-f44b-4c4a-962c-a552e47d3fdc
#> 74 983d5ec9-40e8-4512-9e65-a572a9c486cb
#> 75 7357cee7-9f7f-4ab0-8cec-90de8f047e38
#> 76 6ec405bb-4727-4c6d-ab4e-01fe489af7ea
#> 77 6d41668c-168c-4500-b06a-4674ccf3e19d
#> 78 5e5e7a2f-8f1c-42ac-90dc-b4f80f38e84c
#> 79 55cf0ea3-9d2b-4294-871e-bb4b49a79fc7
#> 80 4f1555bc-4664-46c3-a606-78d34dd10d92
#> 81 2ba40233-8576-4dec-a5f1-2adfa115e2dc
#> 82 2423ce2c-3149-4cca-a2ff-cf682ea29b5f
#> 83 1c9eb291-6d31-47e1-96b2-129b5e1ae64f
#> 84 18eb630b-a754-4111-8cd4-c24ec80aa5ec
#> 85 0d2ee4ac-05ee-40b2-afb6-ebb584caa867
#>                                                                                                                               dataset_title
#> 1                                                                                                                   Human: Great apes study
#> 2                                                                                                           Dissection: Angular gyrus (AnG)
#> 3                                                                                                    Supercluster: CGE-derived interneurons
#> 4                                                                                                   Dissection: Primary auditory cortex(A1)
#> 5                                                                                      Supercluster: Deep layer (non-IT) excitatory neurons
#> 6                                                                                            Supercluster: IT-projecting excitatory neurons
#> 7                                                                                               Dissection: Anterior cingulate cortex (ACC)
#> 8                                                                                                   Human Multiple Cortical Areas SMART-seq
#> 9                                                                                                    Supercluster: MGE-derived interneurons
#> 10                                                                                            Dissection: Primary somatosensory cortex (S1)
#> 11                                                                                                    Dissection: Primary visual cortex(V1)
#> 12                                                                                         Dissection: Dorsolateral prefrontal cortex (DFC)
#> 13                                                                                                    Dissection: Primary motor cortex (M1)
#> 14                                                                                                         Supercluster: Non-neuronal cells
#> 15                                                                                                  Dissection: Middle temporal gyrus (MTG)
#> 16                                                                       Combined single cell and single nuclei RNA-Seq data - Heart Global
#> 17                                                                                                    Global dataset of infant KMT2Ar B-ALL
#> 18                                                                                     Normal immune cells landscape of infant KMT2Ar B-ALL
#> 19                                                                                                      Human Human Microglia 10x scRNA-seq
#> 20                                                                                                    Human Endothelial cells 10x scRNA-seq
#> 21                                                                                                 Human Nurr-Negative Nuclei 10x scRNA-seq
#> 22                                                                                                 Human Nurr-Positive Nuclei 10x scRNA-seq
#> 23                                                                                                     Human Oligodendrocytes 10x scRNA-seq
#> 24                                                                                                            Human OPC Cells 10x scRNA-seq
#> 25                                                                                                           Human DA Neurons 10x scRNA-seq
#> 26                                                                                                       Human Non-DA Neurons 10x scRNA-seq
#> 27                                                                                                           Human Astrocytes 10x scRNA-seq
#> 28                                                                              An Integrated Single Cell Meta-atlas of Human Periodontitis
#> 29                                                                Single-cell analysis of prenatal and postnatal human cortical development
#> 30                                                       All - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 31                                                                                    snRNA-seq of human anterior and posterior hippocampus
#> 32                                                                                                                        3-prime FGID data
#> 33                                                      Single-Cell RNA Sequencing of Breast Tissues: Cell Subtypes and Cancer Risk Factors
#> 34                                                                            Sst Chodl - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 35                                                                                  L6b - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 36                                                                              L5/6 NP - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 37                                                                                 Sncg - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 38                                                                                L6 CT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 39                                                                           Lamp5 Lhx6 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 40                                                                                L4 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 41                                                                      Oligodendrocyte - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 42                                                                            Astrocyte - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 43                                                                       Whole Taxonomy - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 44                                                                                L5 ET - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 45                                                                              L2/3 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 46                                                                                L6 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 47                                                                                  OPC - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 48                                                                                  Vip - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 49                                                                                L5 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 50                                                                          Endothelial - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 51                                                                                 VLMC - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 52                                                                           L6 IT Car3 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 53                                                                        Microglia-PVM - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 54                                                                                Lamp5 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 55                                                                                 Pax6 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 56                                                                                Pvalb - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 57                                                                           Chandelier - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 58                                                                                  Sst - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 59                                                                                                                   donor_p13_trophoblasts
#> 60                                                                                                                  All donors trophoblasts
#> 61                                                                                                     All donors all cell states (in vivo)
#> 62                                                                     Single-cell transcriptomic datasets of Renal cell carcinoma patients
#> 63 Single-cell sequencing links multiregional immune landscapes and tissue-resident T cells in ccRCC to tumor topology and therapy efficacy
#> 64                                                                                                                Tabula Sapiens - Pancreas
#> 65                                                                                                          Tabula Sapiens - Salivary_Gland
#> 66                                                                                                                   Tabula Sapiens - Heart
#> 67                                                                                                                 Tabula Sapiens - Bladder
#> 68                                                                                                                 Tabula Sapiens - Trachea
#> 69                                                                                                                Tabula Sapiens - Prostate
#> 70                                                                                                                  Tabula Sapiens - Spleen
#> 71                                                                                                         Tabula Sapiens - Small_Intestine
#> 72                                                                                                             Tabula Sapiens - Vasculature
#> 73                                                                                                                     Tabula Sapiens - Eye
#> 74                                                                                                                   Tabula Sapiens - Blood
#> 75                                                                                                         Tabula Sapiens - Large_Intestine
#> 76                                                                                                                  Tabula Sapiens - Uterus
#> 77                                                                                                                   Tabula Sapiens - Liver
#> 78                                                                                                                     Tabula Sapiens - Fat
#> 79                                                                                                                  Tabula Sapiens - Tongue
#> 80                                                                                                             Tabula Sapiens - Bone_Marrow
#> 81                                                                                                                 Tabula Sapiens - Mammary
#> 82                                                                                                                  Tabula Sapiens - Kidney
#> 83                                                                                                                  Tabula Sapiens - Muscle
#> 84                                                                                                              Tabula Sapiens - Lymph_Node
#> 85                                                                                                                    Tabula Sapiens - Lung
#>    dataset_total_cell_count
#> 1                    156285
#> 2                    110752
#> 3                    129495
#> 4                    139054
#> 5                     92969
#> 6                    638941
#> 7                    135462
#> 8                     49417
#> 9                    185477
#> 10                   153159
#> 11                   241077
#> 12                   113339
#> 13                   114605
#> 14                   108940
#> 15                   148374
#> 16                   493236
#> 17                   128588
#> 18                    36313
#> 19                    33041
#> 20                    14903
#> 21                   104097
#> 22                    80576
#> 23                   178815
#> 24                    13691
#> 25                    22048
#> 26                    91479
#> 27                    33506
#> 28                   105918
#> 29                   700391
#> 30                   356213
#> 31                   129905
#> 32                    89849
#> 33                    52681
#> 34                     1772
#> 35                    17996
#> 36                    18154
#> 37                    23640
#> 38                    27454
#> 39                    21603
#> 40                    76195
#> 41                   136076
#> 42                    82936
#> 43                  1309414
#> 44                     3848
#> 45                   317116
#> 46                    44174
#> 47                    27670
#> 48                    95014
#> 49                    97173
#> 50                     2496
#> 51                     4619
#> 52                    13007
#> 53                    40625
#> 54                    52828
#> 55                     8984
#> 56                   109618
#> 57                    14871
#> 58                    71545
#> 59                    31497
#> 60                    67070
#> 61                   286326
#> 62                   270855
#> 63                   167283
#> 64                    13497
#> 65                    27199
#> 66                    11505
#> 67                    24583
#> 68                     9522
#> 69                    16375
#> 70                    34004
#> 71                    12467
#> 72                    16037
#> 73                    10650
#> 74                    50115
#> 75                    13680
#> 76                     7124
#> 77                     5007
#> 78                    20263
#> 79                    15020
#> 80                    12297
#> 81                    11375
#> 82                     9641
#> 83                    30746
#> 84                    53275
#> 85                    35682
#>  [ reached 'max' / getOption("max.print") -- omitted 566 rows ]

The sum of cell counts across all datasets should match the number of cells across all SOMA experiments (human, mouse).

# Fetch the "census_data" collection and open every member it lists, looking
# each one up by the name reported in the collection listing.
census_data <- census$get("census_data")
all_experiments <- lapply(
  census_data$to_list(),
  function(member) {
    census_data$get(member$name)
  }
)
print(all_experiments)
#> $homo_sapiens
#> <SOMAExperiment>
#>   uri: s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens 
#>   arrays: obs* 
#>   groups: ms* 
#> 
#> $mus_musculus
#> <SOMAExperiment>
#>   uri: s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/mus_musculus 
#>   arrays: obs* 
#>   groups: ms*
# Count the cells of each experiment by reading only the soma_joinid column of
# its obs data frame, then add the per-experiment counts together.
# vapply() is used instead of sapply() so the result is always a numeric
# vector: with an empty all_experiments, sapply() would return list() and
# sum() would fail, whereas vapply() yields numeric(0) and a total of 0.
experiments_total_cells <- sum(vapply(
  all_experiments,
  function(experiment) {
    nrow(experiment$obs$read(column_names = "soma_joinid")$concat())
  },
  numeric(1)
))

print(paste("Found", experiments_total_cells, "cells in all experiments."))
#> [1] "Found 68683222 cells in all experiments."
# Total of the per-dataset cell counts recorded in the datasets table; this
# is expected to equal the number of cells counted across the experiments.
print(paste(
  "Found",
  sum(as.vector(census_datasets[["dataset_total_cell_count"]])),
  "cells in all datasets."
))
#> [1] "Found 68683222 cells in all datasets."

Fetching the expression data from a single dataset

Let’s pick one dataset to slice out of the census and turn it into an in-memory Seurat object. (This requires the Seurat package to have been installed beforehand.)

# Show the metadata row of the dataset that will be sliced out below.
census_datasets[
  census_datasets[["dataset_id"]] == "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149",
]
#>     soma_joinid                        collection_id    collection_name
#> 581         580 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis
#>                collection_doi                           dataset_id
#> 581 10.1038/s41586-020-2496-1 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149
#>                       dataset_version_id
#> 581 ff352f35-58a2-4962-b716-649d1f9e9f44
#>                                                                                        dataset_title
#> 581 Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
#>                             dataset_h5ad_path dataset_total_cell_count
#> 581 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad                    40220

Create a query on the “RNA” measurement of the mouse experiment, restricted to cells with this dataset_id.

# Attach tiledbsoma before building the query (SOMAAxisQuery is presumably
# exported by it -- confirm against the tiledbsoma reference).
library("tiledbsoma")
# Restrict the obs axis to rows whose dataset_id equals the chosen dataset.
obs_query <- SOMAAxisQuery$new(
  value_filter = "dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'"
)
# Build the query on the "RNA" measurement of the mouse experiment, using the
# obs filter defined above.
expt_query <- census_data$get("mus_musculus")$axis_query(
  measurement_name = "RNA",
  obs_query = obs_query
)
# Materialize the query result as a Seurat object. The named vector presumably
# maps the "raw" X layer to Seurat's "counts" -- confirm against the
# to_seurat() documentation.
dataset_seurat <- expt_query$to_seurat(c(counts = "raw"))
print(dataset_seurat)
#> An object of class Seurat 
#> 52417 features across 40220 samples within 1 assay 
#> Active assay: RNA (52417 features, 0 variable features)
#>  2 layers present: counts, data
#>  1 dimensional reduction calculated: scvi

Downloading the original source H5AD file of a dataset

You can use the cellxgene.census::get_source_h5ad_uri() API to fetch a URI pointing to the H5AD associated with this dataset_id. This is the same H5AD you can download from CZ CELLxGENE Discover, and it may contain additional data-submitter-provided information that was not included in the Census.

You can either download the file directly to your system, or fetch its location in the cloud and download it with your preferred method.

# Option 1: Direct download
# The file is written to the R session's temporary directory rather than a
# hard-coded "/tmp" path, so the example also runs on systems without /tmp
# (e.g. Windows). overwrite = TRUE replaces a file left by an earlier run.
download_source_h5ad(
  dataset_id = "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149",
  file = file.path(tempdir(), "Tabula_Muris_Senis-bone_marrow.h5ad"),
  overwrite = TRUE
)
# Option 2: Get location and download via preferred method
get_source_h5ad_uri("0bd1a1de-3aee-40e0-b2ec-86c7a30c7149")
#> $uri
#> [1] "s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/h5ads/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad"
#> 
#> $s3_region
#> [1] "us-west-2"

The local H5AD file can then be loaded into R with SeuratDisk’s anndata converter.

Close the census

After use, the census object should be closed to release memory and other resources.

# Close the census handle that was opened with open_soma().
census$close()

This also closes all SOMA objects accessed via the top-level census. Closing can be automated using on.exit(census$close(), add = TRUE) immediately after census <- open_soma().