Census Datasets example • cellxgene.census

This tutorial demonstrates basic use of the census_datasets data frame that contains metadata of the Census source datasets. This metadata can be joined to the cell metadata data frame (obs) via the column dataset_id.

Contents

Fetching the datasets table.
Fetching the expression data from a single dataset.
Downloading the original source H5AD file of a dataset.

Fetching the datasets table

Each Census contains a top-level data frame itemizing the datasets contained therein. You can read this SOMADataFrame into an Arrow Table:

# Load the Census client and open the Census (no arguments, so the package
# defaults decide which release is opened).
library("cellxgene.census")
census <- open_soma()

# Navigate census_info -> datasets, then read the SOMADataFrame and
# concatenate the result into a single Arrow Table.
census_info <- census$get("census_info")
datasets_sdf <- census_info$get("datasets")
census_datasets <- datasets_sdf$read()$concat()
print(census_datasets)
#> Table
#> 812 rows x 10 columns
#> $soma_joinid <int64 not null>
#> $citation <large_string not null>
#> $collection_id <large_string not null>
#> $collection_name <large_string not null>
#> $collection_doi <large_string not null>
#> $dataset_id <large_string not null>
#> $dataset_version_id <large_string not null>
#> $dataset_title <large_string not null>
#> $dataset_h5ad_path <large_string not null>
#> $dataset_total_cell_count <int64 not null>

and then an R data frame:

# Convert the Arrow Table into a base R data frame, then print only the
# identifying columns and the per-dataset cell count.
census_datasets <- as.data.frame(census_datasets)
summary_columns <- c("dataset_id", "dataset_title", "dataset_total_cell_count")
print(census_datasets[, summary_columns])
#>                              dataset_id
#> 1  0895c838-e550-48a3-a777-dbcd35d30272
#> 2  00ff600e-6e2e-4d76-846f-0eec4f0ae417
#> 3  bdacc907-7c26-419f-8808-969eab3ca2e8
#> 4  a5d95a42-0137-496f-8a60-101e17f263c8
#> 5  d3566d6a-a455-4a15-980f-45eb29114cab
#> 6  de17ac25-550a-4018-be75-bbb485a0636e
#> 7  9f1049ac-f8b7-45ad-8e31-6e96c3e5058f
#> 8  703f00e6-b996-48e5-bc34-00c41b9876f4
#> 9  e347396c-a7ff-4691-9f7a-99a43555ca18
#> 10 524e045e-e74c-4e00-9884-d5c3bef3d862
#> 11 a49d9109-1d0c-4b36-8139-19aa9a83428c
#> 12 0c9a8cfb-6649-4d52-b418-6d8e56bd7afe
#> 13 06b91002-4d3d-4d2e-8484-20c3b31e232c
#> 14 4993d61c-1d04-4630-9c61-8d9431f39adc
#> 15 b94e3bdf-a385-49cc-b312-7a63cc28b77a
#> 16 df287f8d-f50d-4620-ab96-489d559e6adc
#> 17 eec804b9-2ae5-44f0-a1b5-d721e21257de
#> 18 f9ad5649-f372-43e1-a3a8-423383e5a8a2
#> 19 a810e511-c18b-4b2a-8fdf-98a6a0d433a7
#> 20 ea426edb-4e86-4c53-ab17-5b952d94a31e
#> 21 04d87de6-c20a-4186-8884-f47dba20b0a4
#> 22 b25f3834-69b3-4d87-a272-3938432d1f30
#> 23 94423ec1-21f8-40e8-b5c9-c3ea82350ca4
#> 24 2f6a20f1-173d-4b8d-860b-c47ffea120fa
#> 25 da684768-fb01-455b-9f0f-b63a3e2f844f
#> 26 a65bcc2d-4243-44c1-a262-ab7dcddfcf86
#> 27 75a881cf-5d88-46e2-bf9b-97e5cbc1bd56
#> 28 0c86f0de-ddcb-454c-b00b-37feb69e7da1
#> 29 9f049476-2431-4645-a2d6-f6e85892b603
#> 30 0380ddce-c31b-422a-88fe-34a1945bd949
#> 31 1009f384-b12d-448e-ba9f-1b7d2ecfbb4e
#> 32 3b6ed41e-10a1-47dd-b995-8cde7d041fd6
#> 33 e763ed0d-0e5a-4b8e-9514-6da3d9e47956
#> 34 d95ab381-2b7c-4885-b168-0097ed4e397f
#> 35 b3a5a10f-b1cb-4e8e-abce-bf345448625b
#> 36 e5f5d954-cf0e-4bd8-9346-8d1ddf15a08b
#> 37 62de80d7-e6c4-4ff0-ad4c-d3d36f57cb93
#> 38 9cfee1e6-b24f-433d-a269-f01841655d6a
#> 39 4269074c-f2c1-4d88-b2c3-0946f59d5449
#> 40 7d98cc44-b090-4dc8-804f-2750c84fe9d7
#> 41 c3d381b2-3104-444e-8ad5-d3524407bbb6
#> 42 66d15835-5dc8-4e96-b0eb-f48971cb65e8
#> 43 8a554710-08bc-4005-87cd-da9675bdc2e7
#> 44 ce009dc1-ac57-4386-b72f-5c575701c253
#> 45 dbf0bd35-87f8-4b25-bc90-a3c54f379907
#> 46 6de332e1-465e-4243-9412-6fdc7497e99d
#> 47 faed4f71-6b50-4fc7-bd1c-8f385dccfdce
#> 48 c8f83821-a242-4ed7-86e9-7da077f5d348
#> 49 731e6380-879f-4b0b-9a1f-2150208852ef
#> 50 774c18c5-efa1-4dc5-9e5e-2c824bab2e34
#> 51 ea786a06-5855-48b7-80d7-0313a21a2044
#> 52 524179b0-b406-4723-9c46-293ffa77ca81
#> 53 cbd62079-bed8-4aa1-9659-670f9cb51f9d
#> 54 3e87b1fa-472a-401c-8fa8-f31c10437d5f
#> 55 f16f4108-7873-4035-9989-3748da1a7ff1
#> 56 58679288-9ecc-4647-9781-12a3a8f8c6fd
#> 57 cdefb878-7f00-4b9d-9eda-b3652cfac0c8
#> 58 1492eb6b-7d50-4c4d-94ac-c801a7d5555c
#> 59 bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7
#> 60 ee195b7d-184d-4dfa-9b1c-51a7e601ac11
#> 61 9b188f26-c8e1-4a78-af15-622a35a371fc
#> 62 f6dafdd1-d746-407e-8019-4470e02d4cbd
#> 63 24066994-8183-488d-b037-ef6bb524af39
#> 64 84242d25-f656-4ca6-8e8d-f3d2beeba11f
#> 65 8f1bc86b-7976-4826-8602-f5266160ad86
#> 66 2fc9c59f-3cfd-48d9-9b23-e369ea31bff3
#> 67 470565f2-5afc-456a-b617-18e4496c04fd
#> 68 4c6f9f26-5470-455b-8933-c408232fbf56
#> 69 b07e5164-baf6-43d2-bdba-5a249d0da879
#> 70 bf176af2-4432-4391-9b35-e21bd86ca4f8
#> 71 7b3368a5-c1a0-4973-9e75-d95b4150c7da
#> 72 f75f2ff4-2884-4c2d-b375-70de37a34507
#> 73 2ecc72f8-085f-4e86-8692-771f316c54f6
#> 74 cec9f9a5-8832-437d-99af-fb8237cde54b
#> 75 de4e7a0c-91b2-44e4-b382-87da74c9efb6
#> 76 20d87640-4be8-487f-93d4-dce38378d00f
#> 77 81e91ff8-f619-4ad1-a0c3-b45e1dc63f68
#> 78 03d5794d-cde9-4769-a1a9-b3899d2b1d87
#> 79 e2529f66-d051-4670-b34a-7dca4e474f9f
#> 80 dc30c3ec-46d6-4cd8-8ec1-b544a3d0f503
#> 81 389b1fd4-2b65-4f60-baba-feeb17507665
#> 82 cd77258f-b08b-4c89-b93f-6e6f146b1a4d
#> 83 04b0eb97-d816-44bb-93a5-8b2968791aa0
#> 84 51f476f7-b24d-42f3-8871-7dab3fa35e96
#> 85 6dafb698-7a53-4699-ad13-1b5e2c164be7
#>                                                                                                                           dataset_title
#> 1                                                                                                          Healthy human liver: B cells
#> 2                                                                                                  Human tonsil nonlymphoid cells scRNA
#> 3                             Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG microglia
#> 4                                                                                                      Steady-state B cells - scRNA-seq
#> 5                                                                                      blood and bone marrow from a healthy young donor
#> 6                                                                                                            Myeloid cells of human eye
#> 7                              Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC microglia
#> 8                                                                                                                                   PNS
#> 9                                                                                 Stellate cells from human healthy donor liver samples
#> 10                                                                                          Healthy human liver: hepatic stellate cells
#> 11                                                                                                     Healthy human liver: lymphocytes
#> 12                                                                                Cholangiocytes from human healthy donor liver samples
#> 13                            Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC astrocytes
#> 14                                                                                       B cells from human healthy donor liver samples
#> 15                           Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG astrocytes
#> 16             Urethral luminal epithelia are castration-insensitive cells of the proximal prostate - Human Fibromuscular Stromal Cells
#> 17                                                                           74 years old female - Airway Wash (5 days post-intubation)
#> 18                       Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC oligodendrocyte
#> 19             Urethral luminal epithelia are castration-insensitive cells of the proximal prostate - Mouse Fibromuscular Stromal Cells
#> 20                      A Cellular Anatomy of the Normal Adult Human Prostate and Prostatic Urethra - Human Fibromuscular Stromal Cells
#> 21                                                         A single-cell and spatially-resolved atlas of human breast cancers - B_cells
#> 22                                                                            82 years old female - Airway Wash (1 day post-intubation)
#> 23                                                                                                                UMAP of Myeloid cells
#> 24                                                                            Horizontal cells of the human fovea and peripheral retina
#> 25                                                                                         UMAP visualization of fibroblast subclusters
#> 26                                                                           74 years old female - Airway Wash (7 days post-intubation)
#> 27                    Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC inhibitory neurons
#> 28                                                    A single-cell and spatially-resolved atlas of human breast cancers - Plasmablasts
#> 29                                                                                                  Healthy human liver: cholangiocytes
#> 30                                          Aorta <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 31                                                                                     Neuronal <U+2014> Cells of the adult human heart
#> 32                                                                                                     Healthy human liver: macrophages
#> 33                           Platelet sub_clusters of COVID-19 Immune Altas: Integration of 5 public COVID-19 PBMC single-cell datasets
#> 34                                                                                                              Cone cells of human eye
#> 35                                                                                                       15 leukemic bone marrow donors
#> 36                                                                             49 years old male - Airway Wash (3 days post-intubation)
#> 37                                                                           66 years old female - Airway Wash (4 days post-intubation)
#> 38                                                                                        Retinal pigment epithelial cells of human eye
#> 39                                            Spatiotemporal analysis of human intestinal development at single-cell resolution: Immune
#> 40                                                                              49 years old male - Airway Wash (1 day post-intubation)
#> 41                                                                                                        Horizontal cells of human eye
#> 42       Single cell transcriptome analysis of human pancreas reveals transcriptional signatures of aging and somatic mutation patterns
#> 43                                                                            82 years old female - Fresh PBMCs (1 day post-intubation)
#> 44                                                                           74 years old female - Airway Wash (6 days post-intubation)
#> 45                                                                           66 years old female - Airway Wash (3 days post-intubation)
#> 46                                                                                                           e12.5 thalamic progenitors
#> 47                                                                                                            UMAP of Endothelial cells
#> 48                                                                                                            white matter - astrocytes
#> 49                                                                           74 years old female - Airway Wash (8 days post-intubation)
#> 50                                                                                     Infiltrating Neoplastic Cells Human Glioblastoma
#> 51                                                                           66 years old female - Fresh PBMCs (3 days post-intubation)
#> 52                                   Kidney - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
#> 53                                      Diaphragm <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 54                                                                        Endothelial - MTG: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 55                                                                                    Supercluster: Committed oligodendrocyte precursor
#> 56                    Spatiotemporal analysis of human intestinal development at single-cell resolution: Myofibroblasts and Mesothelium
#> 57                                                                                                                               Cilium
#> 58                   Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG inhibitory neurons
#> 59                                                                                                 DCM/ACM heart cell atlas: Adipocytes
#> 60                                                                                                                       Adult duodenum
#> 61                                                                                                            UMAP of Fibroblasts cells
#> 62                                                                                                                              B cells
#> 63                      Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG oligodendrocyte
#> 64                                                                                                             white matter - microglia
#> 65                           Brown adipose tissue <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 66                                                                                             Mature kidney dataset: non PT parenchyma
#> 67                                                                                         Dissection: Midbrain (RN) - Red Nucleus - RN
#> 68 chRCC - Single-cell analyses of renal cell cancers reveal insights into tumor microenvironment, cell of origin, and therapy response
#> 69                                                                              A Single-Cell Transcriptome Atlas of the Human Pancreas
#> 70                                                                              Healthy human liver: liver sinusoidal endothelial cells
#> 71                                                                                                                              B cells
#> 72                                                                                   Adipocytes <U+2014> Cells of the adult human heart
#> 73                                                                      Endothelial - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 74                                                                                                  Retinal ganglion cells of human eye
#> 75                          Large intestine - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
#> 76                                                                                                        Mature kidney dataset: immune
#> 77                                                                          Sst Chodl - MTG: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 78                                                                                                     UMAP of Plasma and B-Cells cells
#> 79                                                             A Single-Cell Atlas of Mouse White Adipose Tissue - Mouse vascular cells
#> 80                                                                                                               Oligodendrocytes in MS
#> 81                                                                                                                          vasculature
#> 82                    Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC excitatory neurons
#> 83                                         Spatiotemporal analysis of human intestinal development at single-cell resolution: Pericytes
#> 84                                                                           74 years old female - Airway Wash (4 days post-intubation)
#> 85                                                                                                              Supercluster: Ependymal
#>    dataset_total_cell_count
#> 1                       146
#> 2                       363
#> 3                      3799
#> 4                      1324
#> 5                     15502
#> 6                       395
#> 7                      5572
#> 8                       649
#> 9                      1417
#> 10                     1374
#> 11                     2346
#> 12                     1011
#> 13                     5500
#> 14                     1250
#> 15                     5970
#> 16                     1295
#> 17                     1324
#> 18                     8168
#> 19                     1647
#> 20                     2113
#> 21                     3206
#> 22                     1074
#> 23                     3282
#> 24                     2868
#> 25                     2303
#> 26                      810
#> 27                     5331
#> 28                     3524
#> 29                     1861
#> 30                      906
#> 31                     3961
#> 32                     3977
#> 33                     7274
#> 34                     1378
#> 35                    31586
#> 36                     2487
#> 37                     2598
#> 38                     1635
#> 39                     2199
#> 40                     2489
#> 41                     1875
#> 42                     2544
#> 43                     4232
#> 44                     2834
#> 45                     3434
#> 46                     1789
#> 47                     3951
#> 48                     3596
#> 49                     2065
#> 50                     3589
#> 51                     4792
#> 52                     1833
#> 53                     1858
#> 54                     1973
#> 55                     4720
#> 56                     2833
#> 57                     1641
#> 58                     7506
#> 59                     2576
#> 60                     5200
#> 61                     5754
#> 62                     3699
#> 63                    15772
#> 64                     3851
#> 65                     2223
#> 66                     4620
#> 67                     4714
#> 68                     2576
#> 69                     2126
#> 70                     6289
#> 71                     4138
#> 72                     3799
#> 73                     2496
#> 74                     1777
#> 75                     1887
#> 76                     7803
#> 77                     1310
#> 78                     9015
#> 79                     7632
#> 80                    17799
#> 81                      959
#> 82                     8362
#> 83                     4163
#> 84                     4147
#> 85                     5882
#>  [ reached 'max' / getOption("max.print") -- omitted 727 rows ]

The sum of cell counts across all datasets should match the number of cells across all SOMA experiments (human, mouse).

census_data <- census$get("census_data")

# Fetch the member of `census_data` that a listing entry refers to by name.
open_experiment <- function(member) {
  census_data$get(member$name)
}

# One experiment per entry of the collection listing; lapply() keeps the
# listing's element names on the result.
all_experiments <- lapply(census_data$to_list(), open_experiment)
print(all_experiments)
#> $homo_sapiens
#> <SOMAExperiment>
#>   uri: s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens 
#>   arrays: obs* 
#>   groups: ms* 
#> 
#> $mus_musculus
#> <SOMAExperiment>
#>   uri: s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/mus_musculus 
#>   arrays: obs* 
#>   groups: ms*

# Count the cells in every experiment by reading only the `soma_joinid`
# column of `obs` and taking the row count of the resulting table.
# vapply() (rather than sapply()) guarantees exactly one numeric count per
# experiment, and yields numeric(0) -- hence a sum of 0 -- when the list of
# experiments is empty, where sapply() would return list() and sum() fails.
experiments_total_cells <- sum(vapply(
  all_experiments,
  function(experiment) {
    nrow(experiment$obs$read(column_names = c("soma_joinid"))$concat())
  },
  numeric(1)
))

print(paste("Found", experiments_total_cells, "cells in all experiments."))
#> [1] "Found 115556140 cells in all experiments."
# Total of the per-dataset cell counts recorded in the datasets table.
datasets_total_cells <- sum(
  as.vector(census_datasets$dataset_total_cell_count)
)
print(paste("Found", datasets_total_cells, "cells in all datasets."))
#> [1] "Found 115556140 cells in all datasets."

Fetching the expression data from a single dataset

Let’s pick one dataset to slice out of the census and turn it into an in-memory Seurat object. (This requires the Seurat package to be installed beforehand.)

# Show the metadata row of the dataset used in the rest of this section.
selected_dataset_id <- "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149"
census_datasets[census_datasets$dataset_id == selected_dataset_id, ]
#>     soma_joinid
#> 514         513
#>                                                                                                                                                                                                                                                                                                          citation
#> 514 Publication: https://doi.org/10.1038/s41586-020-2496-1 Dataset Version: https://datasets.cellxgene.cziscience.com/64d581c0-2683-44f4-b65b-019e679a33e8.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb
#>                            collection_id    collection_name            collection_doi
#> 514 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis 10.1038/s41586-020-2496-1
#>                               dataset_id                   dataset_version_id
#> 514 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149 64d581c0-2683-44f4-b65b-019e679a33e8
#>                                                                                        dataset_title
#> 514 Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
#>                             dataset_h5ad_path dataset_total_cell_count
#> 514 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad                    40220

Create a query on the mouse experiment, “RNA” measurement, for the dataset_id.

library("tiledbsoma")

# Cell-axis (obs) filter that keeps only the cells of the chosen dataset.
obs_query <- SOMAAxisQuery$new(
  value_filter = "dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'"
)

# Query the "RNA" measurement of the mouse experiment using that filter.
mouse_experiment <- census_data$get("mus_musculus")
expt_query <- mouse_experiment$axis_query(
  measurement_name = "RNA",
  obs_query = obs_query
)

# Export the query result as a Seurat object, passing the "raw" layer under
# the name `counts`, and print its summary.
dataset_seurat <- expt_query$to_seurat(c(counts = "raw"))
print(dataset_seurat)
#> An object of class Seurat 
#> 52437 features across 40220 samples within 1 assay 
#> Active assay: RNA (52437 features, 0 variable features)
#>  2 layers present: counts, data

Downloading the original source H5AD file of a dataset

You can use the cellxgene.census::get_source_h5ad_uri() API to fetch a URI pointing to the H5AD associated with this dataset_id. This is the same H5AD you can download from CZ CELLxGENE Discover, and it may contain additional information provided by the data submitter that was not included in the Census.

To do this, you can either download the file directly to your system or fetch its location in the cloud.

# Option 1: Direct download
# The destination is built with file.path(tempdir(), ...) so the example runs
# on any operating system (a hard-coded "/tmp" does not exist on Windows).
# The session temporary directory is removed when R exits; point `h5ad_file`
# at a permanent location to keep the download.
h5ad_file <- file.path(tempdir(), "Tabula_Muris_Senis-bone_marrow.h5ad")
download_source_h5ad(
  dataset_id = "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149",
  file = h5ad_file,
  overwrite = TRUE
)

# Option 2: Get location and download via preferred method
get_source_h5ad_uri("0bd1a1de-3aee-40e0-b2ec-86c7a30c7149")
#> $uri
#> [1] "s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/h5ads/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad"
#> 
#> $s3_region
#> [1] "us-west-2"

The local H5AD file can then be loaded in R with SeuratDisk’s anndata converter.

Close the census

After use, the census object should be closed to release memory and other resources.

# Close the handle returned by open_soma() once all reads are finished.
census$close()

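This also closes all SOMA objects accessed via the top-level census. When the census is opened inside a function, closing can be automated by calling on.exit(census$close(), add = TRUE) immediately after census <- open_soma(), so that the census is closed when the function exits, even on error.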