python: 使用tangram进行空间转录组映射表达量分析

Python
188
0
0
2024-05-09

tangram是一种映射单细胞表达量数据到空间转录组数据的方法,它可以将单细胞中的表达量数据映射到空间转录组的每一个cell中。这对于一些gene panel数量较少的空间转录组技术如Xenium、CosMx等可以起到扩充基因数量的作用,因为tangram基因映射后的客观结果是使得每一个Xenium/CosMx数据集的细胞中的基因panel数量将和使用的单细胞数据集的panel数量保持一致,而单细胞数据集panel数量是可以轻松到2万+的。

tangram的官方教程可见GitHub - Tangram(https://github.com/broadinstitute/Tangram)。

本文的示例数据使用10x Genomics的Xenium官方数据,地址可见https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast。除了Xenium空间转录组数据,还附有连续切片来源的单细胞转录组数据。

所有数据如下:

In Situ Sample 1, Replicate 1 In Situ Sample 1, Replicate 2 In Situ Sample 2 5’ Single Cell 3’ Single Cell FRP Visium Spatial

单步执行

如上述,测试数据使用10x Genomics的Xenium官方数据。

import scanpy as sc
import tangram as tg

xenium = sc.read_10x_mtx("data/Xenium_FFPE_Human_Cancer_Rep1_outs/cell_feature_matrix")
adata  = sc.read_10x_h5('data/SC3pv3_GEX_Breast_Cancer_DTC_Aggr_count_filtered_feature_bc_matrix.h5')

xenium.var_names_make_unique()
adata.var_names_make_unique()

使用scanpy进行必要的预处理和降维聚类处理:

# filter
sc.pp.filter_cells(adata, min_genes=3)
sc.pp.filter_genes(adata, min_cells=3)

sc.pp.filter_cells(xenium, min_genes=3)
sc.pp.filter_genes(xenium, min_cells=3)

# normalization
adata_raw = adata
sc.pp.normalize_total(adata, target_sum = 1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value  =10)
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.leiden(adata, resolution = 0.5)
adata_raw.obs['leiden'] = adata.obs.leiden

tangram的数据映射分为三步:

  1. pp_adatas 返回值是一个AnnData对象。寻找单细胞数据和空间转录组数据之间的共有基因,保存至结果对象uns中。另外,此函数有genes参数,可以接受用于训练的基因。
  2. map_cells_to_space 返回的 AnnData,它是一个 cell-by-voxel 的结构,可以给出 cell i位于 voxel j 中的概率。voxel可以理解为是空间转录组数据中的细胞。
  3. project_genes project_genes可以进一步处理map_cells_to_space函数的计算结果,其结果是voxel-by-gene结构,类似于空间转录组数据,但是其基因panel和表达量是来源于单细胞数据集。

另外,tangram支持两种模式:cell和cluster模式,cluster可以很大的节省计算时间和计算空间(内存)。他们的区别就是在map_cells_to_space和project_genes函数设置上,需要将mode和cluster_label设置好。

cell模式

tg.pp_adatas(adata_raw, xenium, genes=None) # genes = None, using all genes
ad_map = tg.map_cells_to_space(adata_raw,xenium)
ad_ge = tg.project_genes(ad_map, adata_raw)

cluster模式

tg.pp_adatas(adata_raw, xenium, genes=None) # genes = None, using all genes
ad_map = tg.map_cells_to_space(
  adata_raw,
  xenium, 
  device = 'cpu', 
  mode   = 'clusters', # 'cell', 'clusters', 'constrained'
  cluster_label = 'leiden', 
  density_prior = 'rna_count_based' # 'rna_count_based' or 'uniform', can also be a ndarray
)
ad_ge = tg.project_genes(ad_map, adata_raw, cluster_label = 'leiden')

ad_ge就是映射了单细胞转录数据的空间转录组数据。

批量模式

可以将各种需要的参数都设置好,并改写为python函数,以便于复用。

根据文献‘Benchmarking spatial and single-cell transcriptomics integration methods for transcript distribution prediction and cell type deconvolution’的描述,tangram适合raw-raw的模式:

raw expression matrix of spatial data and raw expression matrix of scRNA-seq data (R-R);

因此此函数除支持cluster模式之外,还支持是否对单细胞和空间转录组数据进行Normalize处理的选项,此外也可以自定义trainning gene。

函数run_tangram的定义如下:

import scanpy as sc
import tangram as tg
import pandas as pd

# script to run tangram
# according to paper of QUNKUNlab, raw-raw is recommended
def run_tangram(
    singlecell: sc.AnnData,
    sp: sc.AnnData,
    trainning_gene=None,
    predicted_gene=None,
    do_norm: bool = False,
    do_log: bool = False,
    device: str = "cpu",
    mode: str = 'clusters',
    cluster_label: str = 'leiden',
    resolution: int = 0.5,
    density_prior: str = 'rna_count_based'
):
    """
    Function to run tangram.

    Args:
        do_norm: whether to do normalize, default is False
        do_log:  whether to do log1p, default is False
        trainning_gene: genes using to trainning model
        predicted_gene: genes to predict, if not none,  only return dataset of predicted_gene

    Returns:
        a pandas.DataFrame, row is cells and column is gene
    """

    singlecell = singlecell.copy()
    sp = sp.copy()

    # norm
    if do_norm:
        sc.pp.normalize_total(singlecell, target_sum=1e4)
        sc.pp.normalize_total(sp, target_sum=1e4)
    if do_log:
        sc.pp.log1p(singlecell)
        sc.pp.log1p(sp)

    # saving gene name, gene will changed to lower-case after running pp_adatas
    gene_name = singlecell.var_names

    # genes = None, using all genes
    tg.pp_adatas(singlecell, sp, genes=trainning_gene)

    if mode == 'clusters':
        sc_raw = singlecell
        sc.pp.normalize_total(singlecell, target_sum=1e4)
        sc.pp.log1p(singlecell)
        sc.pp.highly_variable_genes(singlecell)
        singlecell = singlecell[:, singlecell.var.highly_variable]
        sc.pp.scale(singlecell, max_value=10)
        sc.tl.pca(singlecell)
        sc.pp.neighbors(singlecell)
        sc.tl.leiden(singlecell, resolution=resolution)
        sc_raw.obs[cluster_label] = singlecell.obs.leiden

        ad_map = tg.map_cells_to_space(
            sc_raw,
            sp,
            device=device,
            mode=mode,  # 'cell', 'clusters', 'constrained'
            cluster_label=cluster_label,
            density_prior=density_prior  # 'rna_count_based' or 'uniform', can also be a ndarray
        )
        ad_ge = tg.project_genes(ad_map, sc_raw, cluster_label=cluster_label)
    else:
        ad_map = tg.map_cells_to_space(singlecell, sp, device=device)
        ad_ge = tg.project_genes(ad_map, singlecell)

    assert all(ad_ge.var_names.str.lower() == gene_name.str.lower())

    predicted_dat = pd.DataFrame(
        ad_ge.X, index=ad_ge.obs_names, columns=gene_name)


    if not predicted_gene is None:
        predicted_dat = predicted_dat.loc[:, predicted_gene]

    return(predicted_dat)

使用示例:

# running analysis
import os
import random
import numpy as np
import pandas as pd
import scanpy as sc


dir_name = "geneImputation_output"

assert os.path.exists(dir_name)
assert os.path.exists('data')


xenium = sc.read_10x_mtx("data/Xenium_FFPE_Human_Cancer_Rep1_outs/cell_feature_matrix")
adata  = sc.read_10x_h5('data/SC3pv3_GEX_Breast_Cancer_DTC_Aggr_count_filtered_feature_bc_matrix.h5')

xenium.var_names_make_unique()
adata.var_names_make_unique()


# filter
sc.pp.filter_cells(adata, min_genes=10)
sc.pp.filter_genes(adata, min_cells=100)

sc.pp.filter_cells(xenium, min_genes=10)  # cell count bigger than 250M
sc.pp.filter_genes(xenium, min_cells=10)  # only 280+ genes


# trainning gene
all_common_genes = sorted(list(set(xenium.var_names) & set(adata.var_names)))
assert len(all_common_genes) > 0

train_ratio = 2/3
train_genes_count = int(train_ratio * len(all_common_genes))
assert train_genes_count > 0

random.seed(1234)
trainning_gene = random.sample(all_common_genes, train_genes_count)
testing_gene = list(set(all_common_genes) - set(trainning_gene))


# subset xenium
if not all(xenium.var_names.isin(all_common_genes)):
    print(f"Some genes ({len(set(xenium.var_names) - set(all_common_genes))}) not existed in singlecell, woule be discared!", flush=True)
    xenium = xenium[:, xenium.var_names.isin(all_common_genes)]

# analysis
for norm in ['raw', 'norm']:
    # 是否norm
    if norm == "raw":
        norm_used = False
    else:
        norm_used = True

    # testing_gene
    if not norm_used:
        testing_gene_used = None
    else:
        testing_gene_used = testing_gene

    # run tangram
    dat_tg = run_tangram(
        adata,
        xenium, 
        do_norm=norm_used,
        do_log=norm_used,
        mode='clusters',
        trainning_gene=trainning_gene, 
        predicted_gene=testing_gene_used
    )
    if not norm_used:
        # 直接导出数据
        save_csv(dat_tg.loc[: , testing_gene] , 'tangram_in_' + str(norm) + '.csv', dir_name=dir_name)
   else:
        save_csv(dat_tg, 'tangram_in_' + str(norm) + '.csv', dir_name=dir_name)

结果产出是一个只有测试基因的单细胞推断后的空间转录组矩阵数据,下游的预测和真实值的对比分析、可视化分析,个人的习惯还是在R里面进行,都是常规代码,这里就不再赘述了。