This tutorial demonstrates basic use of the `census_datasets` data frame, which contains metadata of the Census source datasets. This metadata can be joined to the cell metadata data frame (`obs`) via the column `dataset_id`.
Contents
- Fetching the datasets table.
- Fetching the expression data from a single dataset.
- Downloading the original source H5AD file of a dataset.
Fetching the datasets table
Each Census contains a top-level data frame itemizing the datasets contained therein. You can read this `SOMADataFrame` into an Arrow Table:
library("cellxgene.census")
# Open the Census, then pull the top-level `datasets` table out of the
# `census_info` collection and materialize it as a single Arrow Table.
census <- open_soma()
census_info <- census$get("census_info")
census_datasets <- census_info$get("datasets")$read()$concat()
print(census_datasets)
#> Table
#> 651 rows x 9 columns
#> $soma_joinid <int64 not null>
#> $collection_id <large_string not null>
#> $collection_name <large_string not null>
#> $collection_doi <large_string not null>
#> $dataset_id <large_string not null>
#> $dataset_version_id <large_string not null>
#> $dataset_title <large_string not null>
#> $dataset_h5ad_path <large_string not null>
#> $dataset_total_cell_count <int64 not null>
You can then convert the Arrow Table into an R data frame:
# Convert the Arrow Table to a base R data frame and show only the
# identifying columns plus the per-dataset cell count.
census_datasets <- as.data.frame(census_datasets)
summary_columns <- c(
  "dataset_id",
  "dataset_title",
  "dataset_total_cell_count"
)
print(census_datasets[, summary_columns])
#> dataset_id
#> 1 2bdd3a2c-2ff4-4314-adf3-8a06b797a33a
#> 2 f5b0810c-1664-4a62-ad06-be1d9964aa8b
#> 3 e4ddac12-f48f-4455-8e8d-c2a48a683437
#> 4 e2808a6e-e2ea-41b9-b38c-4a08f1677f02
#> 5 d01c9dff-abd1-4825-bf30-2eb2ba74597e
#> 6 c3aa4f95-7a18-4a7d-8dd8-ca324d714363
#> 7 be401db3-d732-408a-b0c4-71af0458b8ab
#> 8 a5d5c529-8a1f-40b5-bda3-35208970070d
#> 9 9c63201d-bfd9-41a8-bbbc-18d947556f3d
#> 10 93cb76aa-a84b-4a92-8e6c-66a914e26d4c
#> 11 8d1dd010-5cbc-43fb-83f8-e0de8e8517da
#> 12 716a4acc-919e-4326-9672-ebe06ede84e6
#> 13 5bdc423a-59e6-457d-aa01-debd2c9c564f
#> 14 5346f9c6-755e-4336-94cc-38706ec00c2f
#> 15 015c230d-650c-4527-870d-8a805849a382
#> 16 d567b692-c374-4628-a508-8008f6778f22
#> 17 cf83c98a-3791-4537-bbde-a719f6d73c13
#> 18 738942eb-ac72-44ff-a64b-8943b5ecd8d9
#> 19 f8d8b443-bca6-4c3c-9042-669dfb7f8030
#> 20 f5be4b96-f5a3-4c3d-84ac-6f69daf744d5
#> 21 dea1aa78-c0a2-413f-b375-f91cce49e4d0
#> 22 92161459-9103-4379-ae34-73a38eee1d1d
#> 23 5829c7ba-697f-418e-8b98-d605b192dc48
#> 24 4dd1cd23-fc4d-4fd1-9709-602540f3ca6f
#> 25 2856d06c-0ff9-4e01-bfc9-202b74d0b60f
#> 26 251b1a7e-d050-4486-8d50-4c2619eb0f46
#> 27 07760522-707a-4a1c-8891-dbd1226d6b27
#> 28 9fcb0b73-c734-40a5-be9c-ace7eea401c9
#> 29 1a38e762-2465-418f-b81c-6a4bce261c34
#> 30 f16a8f4d-bc97-43c5-a2f6-bbda952e4c5c
#> 31 94c41723-b2c4-4b59-a49a-64c9b851903e
#> 32 6ceeaa86-9ceb-4582-b390-6d4dd6ff0572
#> 33 9a64bf99-ebe5-4276-93a8-bee9dff1cd47
#> 34 fc0ceb80-d2d9-47c1-9d78-b0e45c64c500
#> 35 d0ea3ec4-0f3b-4649-9146-1c0b5f303a55
#> 36 b8920ef5-7d22-497b-abca-a7a9eb76d79a
#> 37 b1d37bbd-9ae4-4404-b2f9-f2fe66750e4e
#> 38 a4e89c26-e8d4-4471-9b06-16a1405880f0
#> 39 a190b2e9-3796-4785-9a2f-013e2a9a43e6
#> 40 9ff9f9ba-016b-4cbb-8899-45dc20860b8b
#> 41 9940f951-3dc0-4579-bbb2-2392786e59a3
#> 42 74d584f0-74fc-482e-b944-e76f29c1ab85
#> 43 6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
#> 44 6cda07c7-5d7a-41ba-9799-5bb73da25a60
#> 45 646e3e87-e46b-4b12-85b5-8d8589e26088
#> 46 6437bc9c-16cb-46c8-8f79-9a7384a0212a
#> 47 58c43cc2-e00e-43c4-94eb-8501369264e1
#> 48 53bc5729-6202-4351-bc99-1f36139e9dc4
#> 49 44c83972-e5d2-4858-ac58-2df9f4bf564b
#> 50 2ecc72f8-085f-4e86-8692-771f316c54f6
#> 51 2e5a9b5d-d31b-4e9f-a179-d5d70ba459fb
#> 52 1c9f5c6b-73da-4d17-95de-df080ffe0df1
#> 53 100c6145-7b0e-4ba6-81c1-ffebed0d1ac4
#> 54 0ed60482-a34f-4268-b576-d69cc30210f6
#> 55 0eccaf0c-19d2-4900-9962-899378adf8be
#> 56 04c94a7d-1133-42c9-bb48-c697bd302a8d
#> 57 0374f03c-62e2-4859-8a14-acb00b0627d5
#> 58 03181d87-4769-41e7-8c39-d9a81835f0d2
#> 59 f171db61-e57e-4535-a06a-35d8b6ef8f2b
#> 60 ecf2e08e-2032-4a9e-b466-b65b395f4a02
#> 61 74cff64f-9da9-4b2a-9b3b-8a04a1598040
#> 62 5af90777-6760-4003-9dba-8f945fec6fdf
#> 63 bd65a70f-b274-4133-b9dd-0d1431b6af34
#> 64 ff45e623-7f5f-46e3-b47d-56be0341f66b
#> 65 f01bdd17-4902-40f5-86e3-240d66dd2587
#> 66 e6a11140-2545-46bc-929e-da243eed2cae
#> 67 e5c63d94-593c-4338-a489-e1048599e751
#> 68 d8732da6-8d1d-42d9-b625-f2416c30054b
#> 69 d77ec7d6-ef2e-49d6-9e79-05b7f8881484
#> 70 cee11228-9f0b-4e57-afe2-cfe15ee56312
#> 71 a357414d-2042-4eb5-95f0-c58604a18bdd
#> 72 a2d4d33e-4c62-4361-b80a-9be53d2e50e8
#> 73 a0754256-f44b-4c4a-962c-a552e47d3fdc
#> 74 983d5ec9-40e8-4512-9e65-a572a9c486cb
#> 75 7357cee7-9f7f-4ab0-8cec-90de8f047e38
#> 76 6ec405bb-4727-4c6d-ab4e-01fe489af7ea
#> 77 6d41668c-168c-4500-b06a-4674ccf3e19d
#> 78 5e5e7a2f-8f1c-42ac-90dc-b4f80f38e84c
#> 79 55cf0ea3-9d2b-4294-871e-bb4b49a79fc7
#> 80 4f1555bc-4664-46c3-a606-78d34dd10d92
#> 81 2ba40233-8576-4dec-a5f1-2adfa115e2dc
#> 82 2423ce2c-3149-4cca-a2ff-cf682ea29b5f
#> 83 1c9eb291-6d31-47e1-96b2-129b5e1ae64f
#> 84 18eb630b-a754-4111-8cd4-c24ec80aa5ec
#> 85 0d2ee4ac-05ee-40b2-afb6-ebb584caa867
#> dataset_title
#> 1 Human: Great apes study
#> 2 Dissection: Angular gyrus (AnG)
#> 3 Supercluster: CGE-derived interneurons
#> 4 Dissection: Primary auditory cortex(A1)
#> 5 Supercluster: Deep layer (non-IT) excitatory neurons
#> 6 Supercluster: IT-projecting excitatory neurons
#> 7 Dissection: Anterior cingulate cortex (ACC)
#> 8 Human Multiple Cortical Areas SMART-seq
#> 9 Supercluster: MGE-derived interneurons
#> 10 Dissection: Primary somatosensory cortex (S1)
#> 11 Dissection: Primary visual cortex(V1)
#> 12 Dissection: Dorsolateral prefrontal cortex (DFC)
#> 13 Dissection: Primary motor cortex (M1)
#> 14 Supercluster: Non-neuronal cells
#> 15 Dissection: Middle temporal gyrus (MTG)
#> 16 Combined single cell and single nuclei RNA-Seq data - Heart Global
#> 17 Global dataset of infant KMT2Ar B-ALL
#> 18 Normal immune cells landscape of infant KMT2Ar B-ALL
#> 19 Human Human Microglia 10x scRNA-seq
#> 20 Human Endothelial cells 10x scRNA-seq
#> 21 Human Nurr-Negative Nuclei 10x scRNA-seq
#> 22 Human Nurr-Positive Nuclei 10x scRNA-seq
#> 23 Human Oligodendrocytes 10x scRNA-seq
#> 24 Human OPC Cells 10x scRNA-seq
#> 25 Human DA Neurons 10x scRNA-seq
#> 26 Human Non-DA Neurons 10x scRNA-seq
#> 27 Human Astrocytes 10x scRNA-seq
#> 28 An Integrated Single Cell Meta-atlas of Human Periodontitis
#> 29 Single-cell analysis of prenatal and postnatal human cortical development
#> 30 All - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
#> 31 snRNA-seq of human anterior and posterior hippocampus
#> 32 3-prime FGID data
#> 33 Single-Cell RNA Sequencing of Breast Tissues: Cell Subtypes and Cancer Risk Factors
#> 34 Sst Chodl - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 35 L6b - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 36 L5/6 NP - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 37 Sncg - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 38 L6 CT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 39 Lamp5 Lhx6 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 40 L4 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 41 Oligodendrocyte - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 42 Astrocyte - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 43 Whole Taxonomy - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 44 L5 ET - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 45 L2/3 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 46 L6 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 47 OPC - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 48 Vip - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 49 L5 IT - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 50 Endothelial - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 51 VLMC - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 52 L6 IT Car3 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 53 Microglia-PVM - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 54 Lamp5 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 55 Pax6 - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 56 Pvalb - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 57 Chandelier - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 58 Sst - DLPFC: Seattle Alzheimer's Disease Atlas (SEA-AD)
#> 59 donor_p13_trophoblasts
#> 60 All donors trophoblasts
#> 61 All donors all cell states (in vivo)
#> 62 Single-cell transcriptomic datasets of Renal cell carcinoma patients
#> 63 Single-cell sequencing links multiregional immune landscapes and tissue-resident T cells in ccRCC to tumor topology and therapy efficacy
#> 64 Tabula Sapiens - Pancreas
#> 65 Tabula Sapiens - Salivary_Gland
#> 66 Tabula Sapiens - Heart
#> 67 Tabula Sapiens - Bladder
#> 68 Tabula Sapiens - Trachea
#> 69 Tabula Sapiens - Prostate
#> 70 Tabula Sapiens - Spleen
#> 71 Tabula Sapiens - Small_Intestine
#> 72 Tabula Sapiens - Vasculature
#> 73 Tabula Sapiens - Eye
#> 74 Tabula Sapiens - Blood
#> 75 Tabula Sapiens - Large_Intestine
#> 76 Tabula Sapiens - Uterus
#> 77 Tabula Sapiens - Liver
#> 78 Tabula Sapiens - Fat
#> 79 Tabula Sapiens - Tongue
#> 80 Tabula Sapiens - Bone_Marrow
#> 81 Tabula Sapiens - Mammary
#> 82 Tabula Sapiens - Kidney
#> 83 Tabula Sapiens - Muscle
#> 84 Tabula Sapiens - Lymph_Node
#> 85 Tabula Sapiens - Lung
#> dataset_total_cell_count
#> 1 156285
#> 2 110752
#> 3 129495
#> 4 139054
#> 5 92969
#> 6 638941
#> 7 135462
#> 8 49417
#> 9 185477
#> 10 153159
#> 11 241077
#> 12 113339
#> 13 114605
#> 14 108940
#> 15 148374
#> 16 493236
#> 17 128588
#> 18 36313
#> 19 33041
#> 20 14903
#> 21 104097
#> 22 80576
#> 23 178815
#> 24 13691
#> 25 22048
#> 26 91479
#> 27 33506
#> 28 105918
#> 29 700391
#> 30 356213
#> 31 129905
#> 32 89849
#> 33 52681
#> 34 1772
#> 35 17996
#> 36 18154
#> 37 23640
#> 38 27454
#> 39 21603
#> 40 76195
#> 41 136076
#> 42 82936
#> 43 1309414
#> 44 3848
#> 45 317116
#> 46 44174
#> 47 27670
#> 48 95014
#> 49 97173
#> 50 2496
#> 51 4619
#> 52 13007
#> 53 40625
#> 54 52828
#> 55 8984
#> 56 109618
#> 57 14871
#> 58 71545
#> 59 31497
#> 60 67070
#> 61 286326
#> 62 270855
#> 63 167283
#> 64 13497
#> 65 27199
#> 66 11505
#> 67 24583
#> 68 9522
#> 69 16375
#> 70 34004
#> 71 12467
#> 72 16037
#> 73 10650
#> 74 50115
#> 75 13680
#> 76 7124
#> 77 5007
#> 78 20263
#> 79 15020
#> 80 12297
#> 81 11375
#> 82 9641
#> 83 30746
#> 84 53275
#> 85 35682
#> [ reached 'max' / getOption("max.print") -- omitted 566 rows ]
The sum of cell counts across all datasets should match the number of cells across all SOMA experiments (human, mouse).
# Fetch every member of the `census_data` collection by name, producing a
# named list with one SOMAExperiment per organism.
census_data <- census$get("census_data")
all_experiments <- lapply(
  census_data$to_list(),
  function(member) {
    census_data$get(member$name)
  }
)
print(all_experiments)
#> $homo_sapiens
#> <SOMAExperiment>
#> uri: s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/homo_sapiens
#> arrays: obs*
#> groups: ms*
#>
#> $mus_musculus
#> <SOMAExperiment>
#> uri: s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/census_data/mus_musculus
#> arrays: obs*
#> groups: ms*
# Count the cells in each experiment by reading only the `soma_joinid`
# column of `obs`, then add the per-experiment counts together.
cells_per_experiment <- sapply(all_experiments, function(experiment) {
  joinids <- experiment$obs$read(column_names = c("soma_joinid"))$concat()
  nrow(joinids)
})
experiments_total_cells <- sum(cells_per_experiment)
print(paste("Found", experiments_total_cells, "cells in all experiments."))
#> [1] "Found 68683222 cells in all experiments."
# Total of the per-dataset cell counts recorded in the datasets table; this
# is expected to equal the number of cells found across the experiments.
datasets_total_cells <- sum(
  as.vector(census_datasets$dataset_total_cell_count)
)
print(paste("Found", datasets_total_cells, "cells in all datasets."))
#> [1] "Found 68683222 cells in all datasets."
Fetching the expression data from a single dataset
Let’s pick one dataset to slice out of the Census and turn it into a Seurat in-memory object. (This requires the `Seurat` package to have been installed beforehand.)
# Show the metadata row of the dataset we are about to slice out.
is_selected_dataset <-
  census_datasets$dataset_id == "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149"
census_datasets[is_selected_dataset, ]
#> soma_joinid collection_id collection_name
#> 581 580 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis
#> collection_doi dataset_id
#> 581 10.1038/s41586-020-2496-1 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149
#> dataset_version_id
#> 581 ff352f35-58a2-4962-b716-649d1f9e9f44
#> dataset_title
#> 581 Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
#> dataset_h5ad_path dataset_total_cell_count
#> 581 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad 40220
Create a query on the mouse experiment, “RNA” measurement, for the `dataset_id`.
library("tiledbsoma")
# Restrict the cell (obs) axis to the chosen dataset.
dataset_filter <- "dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'"
obs_query <- SOMAAxisQuery$new(value_filter = dataset_filter)

# Query the "RNA" measurement of the mouse experiment with that obs filter.
mouse_experiment <- census_data$get("mus_musculus")
expt_query <- mouse_experiment$axis_query(
  measurement_name = "RNA",
  obs_query = obs_query
)

# Materialize the query as a Seurat object, using the "raw" layer as counts.
dataset_seurat <- expt_query$to_seurat(c(counts = "raw"))
print(dataset_seurat)
#> An object of class Seurat
#> 52417 features across 40220 samples within 1 assay
#> Active assay: RNA (52417 features, 0 variable features)
#> 2 layers present: counts, data
#> 1 dimensional reduction calculated: scvi
Downloading the original source H5AD file of a dataset
You can use the `cellxgene.census::get_source_h5ad_uri()` API to fetch a URI pointing to the H5AD associated with this `dataset_id`. This is the same H5AD you can download from CZ CELLxGENE Discover, and may contain additional data-submitter provided information which was not included in the Census.
To do this, you can either download the file directly to your system or fetch its location in the cloud.
# Option 1: Direct download
# The file is written into the R session's temporary directory instead of a
# hard-coded "/tmp", which does not exist on every platform (e.g. Windows).
download_source_h5ad(
  dataset_id = "0bd1a1de-3aee-40e0-b2ec-86c7a30c7149",
  file = file.path(tempdir(), "Tabula_Muris_Senis-bone_marrow.h5ad"),
  overwrite = TRUE
)
# Option 2: Get location and download via preferred method
get_source_h5ad_uri("0bd1a1de-3aee-40e0-b2ec-86c7a30c7149")
#> $uri
#> [1] "s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/h5ads/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad"
#>
#> $s3_region
#> [1] "us-west-2"
The local H5AD file can be used in R using SeuratDisk’s anndata converter.