This tutorial demonstrates basic use of the census_datasets data frame, which contains metadata about the Census source datasets. This metadata can be joined to the cell metadata data frame (obs) via the dataset_id column.
Contents
- Fetching the datasets table.
- Fetching the expression data from a single dataset.
- Downloading the original source H5AD file of a dataset.
Fetching the datasets table
Each Census contains a top-level data frame itemizing the datasets contained therein. You can read this SOMADataFrame
into an Arrow Table:
library("cellxgene.census")
# Open the Census and locate the top-level `datasets` data frame inside the
# `census_info` collection.
census <- open_soma()
datasets_soma_df <- census$get("census_info")$get("datasets")

# Read the whole data frame and concatenate the result into one Arrow Table.
census_datasets <- datasets_soma_df$read()$concat()
print(census_datasets)
#> Table
#> 812 rows x 10 columns
#> $soma_joinid <int64 not null>
#> $citation <large_string not null>
#> $collection_id <large_string not null>
#> $collection_name <large_string not null>
#> $collection_doi <large_string not null>
#> $dataset_id <large_string not null>
#> $dataset_version_id <large_string not null>
#> $dataset_title <large_string not null>
#> $dataset_h5ad_path <large_string not null>
#> $dataset_total_cell_count <int64 not null>
and then an R data frame:
# Convert the Arrow Table to a base R data frame, then show only the columns
# that identify each dataset and its size.
census_datasets <- as.data.frame(census_datasets)
columns_to_show <- c("dataset_id", "dataset_title", "dataset_total_cell_count")
print(census_datasets[, columns_to_show])
#> dataset_id
#> 1 0895c838-e550-48a3-a777-dbcd35d30272
#> 2 00ff600e-6e2e-4d76-846f-0eec4f0ae417
#> 3 bdacc907-7c26-419f-8808-969eab3ca2e8
#> 4 a5d95a42-0137-496f-8a60-101e17f263c8
#> 5 d3566d6a-a455-4a15-980f-45eb29114cab
#> 6 de17ac25-550a-4018-be75-bbb485a0636e
#> 7 9f1049ac-f8b7-45ad-8e31-6e96c3e5058f
#> 8 703f00e6-b996-48e5-bc34-00c41b9876f4
#> 9 e347396c-a7ff-4691-9f7a-99a43555ca18
#> 10 524e045e-e74c-4e00-9884-d5c3bef3d862
#> 11 a49d9109-1d0c-4b36-8139-19aa9a83428c
#> 12 0c9a8cfb-6649-4d52-b418-6d8e56bd7afe
#> 13 06b91002-4d3d-4d2e-8484-20c3b31e232c
#> 14 4993d61c-1d04-4630-9c61-8d9431f39adc
#> 15 b94e3bdf-a385-49cc-b312-7a63cc28b77a
#> 16 df287f8d-f50d-4620-ab96-489d559e6adc
#> 17 eec804b9-2ae5-44f0-a1b5-d721e21257de
#> 18 f9ad5649-f372-43e1-a3a8-423383e5a8a2
#> 19 a810e511-c18b-4b2a-8fdf-98a6a0d433a7
#> 20 ea426edb-4e86-4c53-ab17-5b952d94a31e
#> 21 04d87de6-c20a-4186-8884-f47dba20b0a4
#> 22 b25f3834-69b3-4d87-a272-3938432d1f30
#> 23 94423ec1-21f8-40e8-b5c9-c3ea82350ca4
#> 24 2f6a20f1-173d-4b8d-860b-c47ffea120fa
#> 25 da684768-fb01-455b-9f0f-b63a3e2f844f
#> 26 a65bcc2d-4243-44c1-a262-ab7dcddfcf86
#> 27 75a881cf-5d88-46e2-bf9b-97e5cbc1bd56
#> 28 0c86f0de-ddcb-454c-b00b-37feb69e7da1
#> 29 9f049476-2431-4645-a2d6-f6e85892b603
#> 30 0380ddce-c31b-422a-88fe-34a1945bd949
#> 31 1009f384-b12d-448e-ba9f-1b7d2ecfbb4e
#> 32 3b6ed41e-10a1-47dd-b995-8cde7d041fd6
#> 33 e763ed0d-0e5a-4b8e-9514-6da3d9e47956
#> 34 d95ab381-2b7c-4885-b168-0097ed4e397f
#> 35 b3a5a10f-b1cb-4e8e-abce-bf345448625b
#> 36 e5f5d954-cf0e-4bd8-9346-8d1ddf15a08b
#> 37 62de80d7-e6c4-4ff0-ad4c-d3d36f57cb93
#> 38 9cfee1e6-b24f-433d-a269-f01841655d6a
#> 39 4269074c-f2c1-4d88-b2c3-0946f59d5449
#> 40 7d98cc44-b090-4dc8-804f-2750c84fe9d7
#> 41 c3d381b2-3104-444e-8ad5-d3524407bbb6
#> 42 66d15835-5dc8-4e96-b0eb-f48971cb65e8
#> 43 8a554710-08bc-4005-87cd-da9675bdc2e7
#> 44 ce009dc1-ac57-4386-b72f-5c575701c253
#> 45 dbf0bd35-87f8-4b25-bc90-a3c54f379907
#> 46 6de332e1-465e-4243-9412-6fdc7497e99d
#> 47 faed4f71-6b50-4fc7-bd1c-8f385dccfdce
#> 48 c8f83821-a242-4ed7-86e9-7da077f5d348
#> 49 731e6380-879f-4b0b-9a1f-2150208852ef
#> 50 774c18c5-efa1-4dc5-9e5e-2c824bab2e34
#> 51 ea786a06-5855-48b7-80d7-0313a21a2044
#> 52 524179b0-b406-4723-9c46-293ffa77ca81
#> 53 cbd62079-bed8-4aa1-9659-670f9cb51f9d
#> 54 3e87b1fa-472a-401c-8fa8-f31c10437d5f
#> 55 f16f4108-7873-4035-9989-3748da1a7ff1
#> 56 58679288-9ecc-4647-9781-12a3a8f8c6fd
#> 57 cdefb878-7f00-4b9d-9eda-b3652cfac0c8
#> 58 1492eb6b-7d50-4c4d-94ac-c801a7d5555c
#> 59 bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7
#> 60 ee195b7d-184d-4dfa-9b1c-51a7e601ac11
#> 61 9b188f26-c8e1-4a78-af15-622a35a371fc
#> 62 f6dafdd1-d746-407e-8019-4470e02d4cbd
#> 63 24066994-8183-488d-b037-ef6bb524af39
#> 64 84242d25-f656-4ca6-8e8d-f3d2beeba11f
#> 65 8f1bc86b-7976-4826-8602-f5266160ad86
#> 66 2fc9c59f-3cfd-48d9-9b23-e369ea31bff3
#> 67 470565f2-5afc-456a-b617-18e4496c04fd
#> 68 4c6f9f26-5470-455b-8933-c408232fbf56
#> 69 b07e5164-baf6-43d2-bdba-5a249d0da879
#> 70 bf176af2-4432-4391-9b35-e21bd86ca4f8
#> 71 7b3368a5-c1a0-4973-9e75-d95b4150c7da
#> 72 f75f2ff4-2884-4c2d-b375-70de37a34507
#> 73 2ecc72f8-085f-4e86-8692-771f316c54f6
#> 74 cec9f9a5-8832-437d-99af-fb8237cde54b
#> 75 de4e7a0c-91b2-44e4-b382-87da74c9efb6
#> 76 20d87640-4be8-487f-93d4-dce38378d00f
#> 77 81e91ff8-f619-4ad1-a0c3-b45e1dc63f68
#> 78 03d5794d-cde9-4769-a1a9-b3899d2b1d87
#> 79 e2529f66-d051-4670-b34a-7dca4e474f9f
#> 80 dc30c3ec-46d6-4cd8-8ec1-b544a3d0f503
#> 81 389b1fd4-2b65-4f60-baba-feeb17507665
#> 82 cd77258f-b08b-4c89-b93f-6e6f146b1a4d
#> 83 04b0eb97-d816-44bb-93a5-8b2968791aa0
#> 84 51f476f7-b24d-42f3-8871-7dab3fa35e96
#> 85 6dafb698-7a53-4699-ad13-1b5e2c164be7
#> dataset_title
#> 1 Healthy human liver: B cells
#> 2 Human tonsil nonlymphoid cells scRNA
#> 3 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG microglia
#> 4 Steady-state B cells - scRNA-seq
#> 5 blood and bone marrow from a healthy young donor
#> 6 Myeloid cells of human eye
#> 7 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC microglia
#> 8 PNS
#> 9 Stellate cells from human healthy donor liver samples
#> 10 Healthy human liver: hepatic stellate cells
#> 11 Healthy human liver: lymphocytes
#> 12 Cholangiocytes from human healthy donor liver samples
#> 13 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC astrocytes
#> 14 B cells from human healthy donor liver samples
#> 15 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG astrocytes
#> 16 Urethral luminal epithelia are castration-insensitive cells of the proximal prostate - Human Fibromuscular Stromal Cells
#> 17 74 years old female - Airway Wash (5 days post-intubation)
#> 18 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC oligodendrocyte
#> 19 Urethral luminal epithelia are castration-insensitive cells of the proximal prostate - Mouse Fibromuscular Stromal Cells
#> 20 A Cellular Anatomy of the Normal Adult Human Prostate and Prostatic Urethra - Human Fibromuscular Stromal Cells
#> 21 A single-cell and spatially-resolved atlas of human breast cancers - B_cells
#> 22 82 years old female - Airway Wash (1 day post-intubation)
#> 23 UMAP of Myeloid cells
#> 24 Horizontal cells of the human fovea and peripheral retina
#> 25 UMAP visualization of fibroblast subclusters
#> 26 74 years old female - Airway Wash (7 days post-intubation)
#> 27 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC inhibitory neurons
#> 28 A single-cell and spatially-resolved atlas of human breast cancers - Plasmablasts
#> 29 Healthy human liver: cholangiocytes
#> 30 Aorta <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 31 Neuronal <U+2014> Cells of the adult human heart
#> 32 Healthy human liver: macrophages
#> 33 Platelet sub_clusters of COVID-19 Immune Altas: Integration of 5 public COVID-19 PBMC single-cell datasets
#> 34 Cone cells of human eye
#> 35 15 leukemic bone marrow donors
#> 36 49 years old male - Airway Wash (3 days post-intubation)
#> 37 66 years old female - Airway Wash (4 days post-intubation)
#> 38 Retinal pigment epithelial cells of human eye
#> 39 Spatiotemporal analysis of human intestinal development at single-cell resolution: Immune
#> 40 49 years old male - Airway Wash (1 day post-intubation)
#> 41 Horizontal cells of human eye
#> 42 Single cell transcriptome analysis of human pancreas reveals transcriptional signatures of aging and somatic mutation patterns
#> 43 82 years old female - Fresh PBMCs (1 day post-intubation)
#> 44 74 years old female - Airway Wash (6 days post-intubation)
#> 45 66 years old female - Airway Wash (3 days post-intubation)
#> 46 e12.5 thalamic progenitors
#> 47 UMAP of Endothelial cells
#> 48 white matter - astrocytes
#> 49 74 years old female - Airway Wash (8 days post-intubation)
#> 50 Infiltrating Neoplastic Cells Human Glioblastoma
#> 51 66 years old female - Fresh PBMCs (3 days post-intubation)
#> 52 Kidney - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
#> 53 Diaphragm <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 54 Endothelial - MTG: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 55 Supercluster: Committed oligodendrocyte precursor
#> 56 Spatiotemporal analysis of human intestinal development at single-cell resolution: Myofibroblasts and Mesothelium
#> 57 Cilium
#> 58 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG inhibitory neurons
#> 59 DCM/ACM heart cell atlas: Adipocytes
#> 60 Adult duodenum
#> 61 UMAP of Fibroblasts cells
#> 62 B cells
#> 63 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: SFG oligodendrocyte
#> 64 white matter - microglia
#> 65 Brown adipose tissue <U+2014> A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 66 Mature kidney dataset: non PT parenchyma
#> 67 Dissection: Midbrain (RN) - Red Nucleus - RN
#> 68 chRCC - Single-cell analyses of renal cell cancers reveal insights into tumor microenvironment, cell of origin, and therapy response
#> 69 A Single-Cell Transcriptome Atlas of the Human Pancreas
#> 70 Healthy human liver: liver sinusoidal endothelial cells
#> 71 B cells
#> 72 Adipocytes <U+2014> Cells of the adult human heart
#> 73 Endothelial - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 74 Retinal ganglion cells of human eye
#> 75 Large intestine - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
#> 76 Mature kidney dataset: immune
#> 77 Sst Chodl - MTG: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 78 UMAP of Plasma and B-Cells cells
#> 79 A Single-Cell Atlas of Mouse White Adipose Tissue - Mouse vascular cells
#> 80 Oligodendrocytes in MS
#> 81 vasculature
#> 82 Molecular characterization of selectively vulnerable neurons in Alzheimer<U+2019>s Disease: EC excitatory neurons
#> 83 Spatiotemporal analysis of human intestinal development at single-cell resolution: Pericytes
#> 84 74 years old female - Airway Wash (4 days post-intubation)
#> 85 Supercluster: Ependymal
#> dataset_total_cell_count
#> 1 146
#> 2 363
#> 3 3799
#> 4 1324
#> 5 15502
#> 6 395
#> 7 5572
#> 8 649
#> 9 1417
#> 10 1374
#> 11 2346
#> 12 1011
#> 13 5500
#> 14 1250
#> 15 5970
#> 16 1295
#> 17 1324
#> 18 8168
#> 19 1647
#> 20 2113
#> 21 3206
#> 22 1074
#> 23 3282
#> 24 2868
#> 25 2303
#> 26 810
#> 27 5331
#> 28 3524
#> 29 1861
#> 30 906
#> 31 3961
#> 32 3977
#> 33 7274
#> 34 1378
#> 35 31586
#> 36 2487
#> 37 2598
#> 38 1635
#> 39 2199
#> 40 2489
#> 41 1875
#> 42 2544
#> 43 4232
#> 44 2834
#> 45 3434
#> 46 1789
#> 47 3951
#> 48 3596
#> 49 2065
#> 50 3589
#> 51 4792
#> 52 1833
#> 53 1858
#> 54 1973
#> 55 4720
#> 56 2833
#> 57 1641
#> 58 7506
#> 59 2576
#> 60 5200
#> 61 5754
#> 62 3699
#> 63 15772
#> 64 3851
#> 65 2223
#> 66 4620
#> 67 4714
#> 68 2576
#> 69 2126
#> 70 6289
#> 71 4138
#> 72 3799
#> 73 2496
#> 74 1777
#> 75 1887
#> 76 7803
#> 77 1310
#> 78 9015
#> 79 7632
#> 80 17799
#> 81 959
#> 82 8362
#> 83 4163
#> 84 4147
#> 85 5882
#> [ reached 'max' / getOption("max.print") -- omitted 727 rows ]
The sum of cell counts across all datasets should match the number of cells across all SOMA experiments (human, mouse).
# Fetch every member of the `census_data` collection by name; the printed
# result is a list with one SOMAExperiment per member.
census_data <- census$get("census_data")
all_experiments <- lapply(
  census_data$to_list(),
  function(member) census_data$get(member$name)
)
print(all_experiments)
#> $homo_sapiens
#> <SOMAExperiment>
#> uri: s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens
#> arrays: obs*
#> groups: ms*
#>
#> $mus_musculus
#> <SOMAExperiment>
#> uri: s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/mus_musculus
#> arrays: obs*
#> groups: ms*
# Count the cells in each experiment by reading only the `soma_joinid` column
# of its `obs` data frame. `vapply()` pins each per-experiment result to a
# single number (unlike `sapply()`, whose return type depends on its input),
# and accumulating as double avoids integer overflow in `sum()`.
experiments_total_cells <- sum(vapply(
  all_experiments,
  function(experiment) {
    nrow(experiment$obs$read(column_names = c("soma_joinid"))$concat())
  },
  FUN.VALUE = numeric(1)
))
# `format(scientific = FALSE)` keeps a large total in fixed notation rather
# than letting it print as e.g. "1.2e+08".
print(paste(
  "Found", format(experiments_total_cells, scientific = FALSE),
  "cells in all experiments."
))
#> [1] "Found 115556140 cells in all experiments."
# Add up the per-dataset cell counts recorded in the datasets table; this
# total is expected to equal the number of cells across all experiments.
datasets_total_cells <- sum(
  as.vector(census_datasets$dataset_total_cell_count)
)
print(paste("Found", datasets_total_cells, "cells in all datasets."))
#> [1] "Found 115556140 cells in all datasets."
Fetching the expression data from a single dataset
Let’s pick one dataset to slice out of the Census and turn it into an in-memory Seurat object. (This requires the Seurat
package to have been installed beforehand.)
# Look up the metadata row of the dataset we are going to slice out.
is_selected <- census_datasets$dataset_id ==
  "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149"
census_datasets[is_selected, ]
#> soma_joinid
#> 514 513
#> citation
#> 514 Publication: https://doi.org/10.1038/s41586-020-2496-1 Dataset Version: https://datasets.cellxgene.cziscience.com/64d581c0-2683-44f4-b65b-019e679a33e8.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/0b9d8a04-bb9d-44da-aa27-705bb65b54eb
#> collection_id collection_name collection_doi
#> 514 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis 10.1038/s41586-020-2496-1
#> dataset_id dataset_version_id
#> 514 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149 64d581c0-2683-44f4-b65b-019e679a33e8
#> dataset_title
#> 514 Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
#> dataset_h5ad_path dataset_total_cell_count
#> 514 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad 40220
Create a query on the mouse experiment, “RNA” measurement, for the dataset_id.
library("tiledbsoma")
# Restrict the obs (cell) axis to the cells of the chosen dataset.
dataset_filter <- "dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'"
obs_query <- SOMAAxisQuery$new(value_filter = dataset_filter)

# Build the query against the "RNA" measurement of the mouse experiment.
mouse_experiment <- census_data$get("mus_musculus")
expt_query <- mouse_experiment$axis_query(
  measurement_name = "RNA",
  obs_query = obs_query
)

# Export the query result to Seurat, using the "raw" layer as `counts`.
dataset_seurat <- expt_query$to_seurat(c(counts = "raw"))
print(dataset_seurat)
#> An object of class Seurat
#> 52437 features across 40220 samples within 1 assay
#> Active assay: RNA (52437 features, 0 variable features)
#> 2 layers present: counts, data
Downloading the original source H5AD file of a dataset
You can use the cellxgene.census::get_source_h5ad_uri() API to fetch a URI pointing to the H5AD associated with this dataset_id. This is the same H5AD you can download from CZ CELLxGENE Discover, and it may contain additional submitter-provided information that was not included in the Census.
To do this you can fetch the location in the cloud or directly download to your system.
# Both options below refer to the same source dataset.
source_dataset_id <- "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149"

# Option 1: Direct download to a local file
download_source_h5ad(
  dataset_id = source_dataset_id,
  file = "/tmp/Tabula_Muris_Senis-bone_marrow.h5ad",
  overwrite = TRUE
)

# Option 2: Get the cloud location and download it with your preferred tool
get_source_h5ad_uri(source_dataset_id)
#> $uri
#> [1] "s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/h5ads/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad"
#>
#> $s3_region
#> [1] "us-west-2"
The local H5AD file can be used in R using SeuratDisk’s anndata converter.