Integrate scRNA-seq datasets#

import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
import anndata as ad

✅ loaded instance: testuser1/test-scrna (lamindb 0.50.2)

# Start tracking this notebook; the log output below shows a Transform and a
# Run record being saved for it.
ln.track()

💡 notebook imports: anndata==0.9.2 lamindb==0.50.2 lnschema_bionty==0.29.2 pandas==1.5.3

🌱 saved: Transform(id='agayZTonayqAz8', name='Integrate scRNA-seq datasets', short_name='scrna2', stem_id='agayZTonayqA', version='0', type=notebook, updated_at=2023-08-11 19:25:02, created_by_id='DzTjkKse')

🌱 saved: Run(id='jMRFVuQK1zT6V5OI8GdS', run_at=2023-08-11 19:25:02, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')

Query files based on metadata#

# Files linked to a tissue whose name contains "lymph node"
# (icontains: presumably a case-insensitive substring lookup — confirm).
# distinct() drops duplicate rows; df() displays the result as a DataFrame.
ln.File.filter(tissues__name__icontains="lymph node").distinct().df()

	storage_id	key	suffix	accessor	description	version	initial_version_id	size	hash	hash_type	transform_id	run_id	updated_at	created_by_id
id
ua6stqKNCi1IieUcxLKj	GYAIYq9e	None	.h5ad	AnnData	Detmar22	None	None	17342743	rk5lSoJvz6PHRRjmcB919w	md5	Nv48yAceNSh8z8	lEknOLMxVV6gR8QPOxxr	2023-08-11 19:24:32	DzTjkKse
U2P0q1ZTnxMbwTHg9qY2	GYAIYq9e	None	.h5ad	AnnData	Conde22	None	None	28061905	3cIcmoqp1MxjX8NlRkKGlQ	md5	Nv48yAceNSh8z8	lEknOLMxVV6gR8QPOxxr	2023-08-11 19:24:48	DzTjkKse

# Files linked to a cell type whose name contains "monocyte",
# displayed as a DataFrame with duplicate rows removed.
ln.File.filter(cell_types__name__icontains="monocyte").distinct().df()

	storage_id	key	suffix	accessor	description	version	initial_version_id	size	hash	hash_type	transform_id	run_id	updated_at	created_by_id
id
U2P0q1ZTnxMbwTHg9qY2	GYAIYq9e	None	.h5ad	AnnData	Conde22	None	None	28061905	3cIcmoqp1MxjX8NlRkKGlQ	md5	Nv48yAceNSh8z8	lEknOLMxVV6gR8QPOxxr	2023-08-11 19:24:48	DzTjkKse
IHo3Sx7zrOGyJnQBgJhd	GYAIYq9e	None	.h5ad	AnnData	10x reference pbmc68k	None	None	589484	eKVXV5okt5YRYjySMTKGEw	md5	Nv48yAceNSh8z8	lEknOLMxVV6gR8QPOxxr	2023-08-11 19:24:55	DzTjkKse

# Files linked to a label named exactly "female",
# displayed as a DataFrame with duplicate rows removed.
ln.File.filter(labels__name="female").distinct().df()

	storage_id	key	suffix	accessor	description	version	initial_version_id	size	hash	hash_type	transform_id	run_id	updated_at	created_by_id
id
ua6stqKNCi1IieUcxLKj	GYAIYq9e	None	.h5ad	AnnData	Detmar22	None	None	17342743	rk5lSoJvz6PHRRjmcB919w	md5	Nv48yAceNSh8z8	lEknOLMxVV6gR8QPOxxr	2023-08-11 19:24:32	DzTjkKse

Intersect measured genes between two datasets#

# Look up the two file records by their exact description.
# NOTE(review): .one() presumably requires exactly one match — confirm.
conde_query = ln.File.filter(description="Conde22")
pbmc68k_query = ln.File.filter(description="10x reference pbmc68k")
file1 = conde_query.one()
file2 = pbmc68k_query.one()

# Print file1's record together with its provenance and linked features.
file1.describe()

💡 File(id=U2P0q1ZTnxMbwTHg9qY2, key=None, suffix=.h5ad, accessor=AnnData, description=Conde22, version=None, size=28061905, hash=3cIcmoqp1MxjX8NlRkKGlQ, hash_type=md5, created_at=2023-08-11 19:24:48.671223+00:00, updated_at=2023-08-11 19:24:48.671244+00:00)

Provenance:
    🗃️ storage: Storage(id='GYAIYq9e', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-08-11 19:25:00, created_by_id='DzTjkKse')
    📎 initial_version: None
    📔 transform: Transform(id='Nv48yAceNSh8z8', name='Curate & link scRNA-seq datasets', short_name='scrna', stem_id='Nv48yAceNSh8', version='0', type='notebook', updated_at=2023-08-11 19:24:55, created_by_id='DzTjkKse')
    🚗 run: Run(id='lEknOLMxVV6gR8QPOxxr', run_at=2023-08-11 19:24:18, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
    👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-08-11 19:25:00)
Features:
  🗺️ var (X):
    🔗 index (36503, bionty.Gene.id): ['i4dj2esHIMJ6', 'a7NCg0YCetMQ', 'pvRCh0DOHgyg', 'aUs4QKHdMUG6', 'wCm49B6H3fTG'...]
  🗺️ external:
    🔗 species (1, bionty.Species): ['human']
  🗺️ obs (metadata):
    🔗 cell_type (32, bionty.CellType): ['memory B cell', 'CD4-positive helper T cell', 'germinal center B cell', 'CD16-negative, CD56-bright natural killer cell, human', 'CD8-positive, alpha-beta memory T cell']
    🔗 assay (3, bionty.ExperimentalFactor): ["10x 5' v1", "10x 3' v3", "10x 5' v2"]
    🔗 tissue (17, bionty.Tissue): ['blood', 'ileum', 'thymus', 'transverse colon', 'lamina propria']
    🔗 donor (12, core.Label): ['A37', 'A31', 'A52', '637C', '582C']

# Render file1's lineage graph (shown as an SVG image in the output).
file1.view_lineage()

https://d33wubrfki0l68.cloudfront.net/e89ac7aadb0e0cee61c7d0c8b8b6721ec5e7f24b/9a64e/_images/423a7cef7afc3f09e33fdb88f555aed66a92ac656ce8ac7885270b56d500b345.svg

# Print file2's record together with its provenance and linked features.
file2.describe()

💡 File(id=IHo3Sx7zrOGyJnQBgJhd, key=None, suffix=.h5ad, accessor=AnnData, description=10x reference pbmc68k, version=None, size=589484, hash=eKVXV5okt5YRYjySMTKGEw, hash_type=md5, created_at=2023-08-11 19:24:55.560302+00:00, updated_at=2023-08-11 19:24:55.560322+00:00)

Provenance:
    🗃️ storage: Storage(id='GYAIYq9e', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-08-11 19:25:00, created_by_id='DzTjkKse')
    📎 initial_version: None
    📔 transform: Transform(id='Nv48yAceNSh8z8', name='Curate & link scRNA-seq datasets', short_name='scrna', stem_id='Nv48yAceNSh8', version='0', type='notebook', updated_at=2023-08-11 19:24:55, created_by_id='DzTjkKse')
    🚗 run: Run(id='lEknOLMxVV6gR8QPOxxr', run_at=2023-08-11 19:24:18, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
    👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-08-11 19:25:00)
Features:
  🗺️ var (X):
    🔗 index (695, bionty.Gene.id): ['yP4OX033LSYh', 'a7NCg0YCetMQ', 'JGlkDooUtNDt', 'pN8nEASfAYmA', '3NeDgixHNurG'...]
  🗺️ obs (metadata):
    🔗 cell_type (9, bionty.CellType): ['dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'conventional dendritic cell', 'CD14-positive, CD16-negative classical monocyte', 'CD16-positive, CD56-dim natural killer cell, human']

# Render file2's lineage graph (shown as an SVG image in the output).
file2.view_lineage()

https://d33wubrfki0l68.cloudfront.net/6a366fa5d8da49444d589c0e747176ef013a3444/a24eb/_images/702b027bcfd8c53df71c972cdd7bd4d621ef96b66b253f3ab4d46cad975c6a5a.svg

# Load both files into memory (file1 first, then file2). As the log below
# shows, each load registers the file as an input of the current run.
file1_adata, file2_adata = file1.load(), file2.load()

💡 adding file U2P0q1ZTnxMbwTHg9qY2 as input for run jMRFVuQK1zT6V5OI8GdS, adding parent transform Nv48yAceNSh8z8

💡 adding file IHo3Sx7zrOGyJnQBgJhd as input for run jMRFVuQK1zT6V5OI8GdS, adding parent transform Nv48yAceNSh8z8

# Peek at the first few cell-type annotations in file2's obs table.
file2_adata.obs.cell_type.head()

index
GCAGGGCTGGATTC-1                                       dendritic cell
CTTTAGTGGTTACG-6                                B cell, CD19-positive
TGACTGGAACCATG-7                                       dendritic cell
TCAATCACCCTTCG-8                                B cell, CD19-positive
CGTTATACAGTACC-8    effector memory CD4-positive, alpha-beta T cel...
Name: cell_type, dtype: category
Categories (9, object): ['CD8-positive, CD25-positive, alpha-beta regul..., 'effector memory CD4-positive, alpha-beta T ce..., 'cytotoxic T cell', 'CD38-negative naive B cell', ..., 'B cell, CD19-positive', 'conventional dendritic cell', 'CD16-positive, CD56-dim natural killer cell, ..., 'dendritic cell']

Here we compute shared genes without loading files:

# Gene feature sets of the "var" slot, read from the file records rather
# than from the loaded AnnData objects.
file1_genes, file2_genes = file1.features["var"], file2.features["var"]

# Genes present in both feature sets; display the first ten symbols.
shared_genes = file1_genes & file2_genes
shared_symbols = shared_genes.list("symbol")
shared_symbols[:10]

['CHCHD10',
 'FUOM',
 'C16orf74',
 'MCM7',
 'NEDD8',
 'OGG1',
 'FAM185A',
 'GABARAPL2',
 'IL2RB',
 'XCL1']

We also need to convert the Ensembl gene IDs in file1's var index to gene symbols, using a mapping built from file2's gene records, so that the two datasets can be concatenated:

# Two-column frame of (ensembl_gene_id, symbol) pairs for file2's genes;
# the columns keep their default integer labels 0 and 1.
gene_pairs = pd.DataFrame(file2_genes.values_list("ensembl_gene_id", "symbol"))

# Keep the first row per Ensembl ID, then build a Series that maps
# Ensembl ID (index) -> symbol (values).
unique_pairs = gene_pairs.drop_duplicates(0)
mapper = unique_pairs.set_index(0)[1]
mapper.head()

0
ENSG00000198900       TOP1
ENSG00000250479    CHCHD10
ENSG00000117691       NENF
ENSG00000185088     RPS27L
ENSG00000107485      GATA3
Name: 1, dtype: object

# Relabel file1's var index in place with the Ensembl ID -> symbol mapper.
# Index labels that are not keys of the mapper are left unchanged.
# NOTE(review): assumes file1's var index holds Ensembl gene IDs — confirm.
file1_adata.var.rename(index=mapper, inplace=True)

Intersect cell types#

# Cell-type records linked to each file.
file1_celltypes, file2_celltypes = file1.cell_types.all(), file2.cell_types.all()

# Cell types linked to both files, then their names as a plain list.
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

['conventional dendritic cell',
 'CD16-positive, CD56-dim natural killer cell, human']

We can now subset the two datasets by shared cell types:

# Boolean mask over file1's cells: True where the cell type is a shared one.
file1_is_shared = file1_adata.obs["cell_type"].isin(shared_celltypes_names)
file1_adata_subset = file1_adata[file1_is_shared]

# Number of remaining cells per cell type.
file1_adata_subset.obs["cell_type"].value_counts()

CD16-positive, CD56-dim natural killer cell, human    114
conventional dendritic cell                             7
Name: cell_type, dtype: int64

# Boolean mask over file2's cells: True where the cell type is a shared one.
file2_is_shared = file2_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_adata_subset = file2_adata[file2_is_shared]

# Number of remaining cells per cell type.
file2_adata_subset.obs["cell_type"].value_counts()

CD16-positive, CD56-dim natural killer cell, human    3
conventional dendritic cell                           2
Name: cell_type, dtype: int64

# Concatenate the two subsets along obs. Passing a mapping makes its keys
# (the file descriptions) the values written to the new "file" obs column.
subsets_by_description = {
    file1.description: file1_adata_subset,
    file2.description: file2_adata_subset,
}
adata_concat = ad.concat(subsets_by_description, label="file")
adata_concat

AnnData object with n_obs × n_vars = 126 × 695
    obs: 'cell_type', 'file'
    obsm: 'X_umap'

# Count cells for each (cell_type, file) combination in the concatenated obs.
adata_concat.obs.value_counts()

cell_type                                           file                 
CD16-positive, CD56-dim natural killer cell, human  Conde22                  114
conventional dendritic cell                         Conde22                    7
CD16-positive, CD56-dim natural killer cell, human  10x reference pbmc68k      3
conventional dendritic cell                         10x reference pbmc68k      2
dtype: int64