只保留编码蛋白的基因,HgProteinCodingGenes是人的编码蛋白的基因,此外也提供了小鼠的编码基因列表,其他物种可以根据ensembl网站的基因组注释文件gtf文件获取编码蛋白的基因列表; 输入可以是Seurat对象或者SingleCellExperiment对象;数据需要标准化处理 使用Multiple Correspondence Analysis (MCA)方法降维,类似PCA 从PanglaoDB下载所有组织的细胞类型, 从中选择pancreas组织的细胞类型marker基因 panglao_pancreas是数据框,需要转换为列表,名字是细胞类型,值为基因,CelliD的输入格式; all_gs是列表,名字是细胞类型,值为基因;删除基因个数少于10的细胞类型 gs是稀疏矩阵,行是细胞类型,列是细胞,值是-log10转换后校正的p值安装需要用到的R包
install.packages("pak")
c("Seurat","tidyverse","ggpubr","magrittr","data.table","ggpubr","CelliD") %>% pak::pkg_install()using
是我写的函数,作用是一次性加载多个R包,不用写双引号,并且不在屏幕上打印包的加载信息,可以参考之前的推文《如何优雅地管理R包》using的定义;函数名字using是在模仿Julia语言中的包加载函数using(Seurat,tidyverse,data.table,ggpubr,CelliD,ggpubr,magrittr)
# SeuratObject存贮为v5版本
options(Seurat.object.assay.version = "v5")读取表达量矩阵和样本信息
Matrix <- readRDS("BaronMatrix.rds")
MetaData <- readRDS("BaronMetaData.rds")
data("HgProteinCodingGenes")
Matrix <- Matrix[rownames(Matrix) %in% HgProteinCodingGenes,]表达数据处理
scobj <- CreateSeuratObject(counts = Matrix, project = "scobj", min.cells = 5, meta.data = MetaData)
Seurat::DefaultAssay(scobj) <- "RNA"
scobj[["RNA"]] <- split(scobj[["RNA"]], f = scobj$donor)
scobj %<>% Seurat::NormalizeData()
scobj %<>% Seurat::FindVariableFeatures()
scobj %<>% Seurat::ScaleData()
scobj %<>% Seurat::RunPCA()
scobj %<>% Seurat::IntegrateLayers(
method = HarmonyIntegration,
orig.reduction = "pca",
new.reduction = "harmony",
verbose = FALSE
)
scobj %<>% Seurat::FindNeighbors(reduction = "harmony", dims = 1:30)
scobj %<>% Seurat::FindClusters(resolution = 1.0, cluster.name = "Cluster")
scobj[["RNA"]] <- JoinLayers(scobj[["RNA"]])降维
scobj %<>% CelliD::RunMCA()
使用自定义基因列表注释细胞类型
panglao <- readr::read_tsv("PanglaoDB_markers_27_Mar_2020.tsv.gz") %>%
dplyr::filter(str_detect(species,"Hs"))panglao_pancreas <- panglao %>%
dplyr::filter(organ == "Pancreas")panglao_pancreas %<>%
dplyr::group_by(`cell type`) %>%
dplyr::summarise(geneset = list(`official gene symbol`))
pancreas_gs <- setNames(panglao_pancreas$geneset, panglao_pancreas$`cell type`)all_gs <- all_gs[purrr::map_lgl(all_gs, ~length(.x) >= 10) ]
预测细胞类型
gs <- CelliD::RunCellHGT(scobj, pathways = pancreas_gs, dims = 1:50, n.features = 200,log.trans= TRUE,p.adjust = TRUE)
gs_prediction <- rownames(gs)[apply(gs, 2, which.max)]
# 预测的阈值P设置为0.01,小于阈值则使用该预测结果,如果大于该阈值则为unassigned未分类
scobj$pancreas_gs_prediction <- ifelse(apply(gs, 2, max) > -log10(0.01), yes = gs_prediction, "unassigned")UMAP展示预测结果
scobj %<>% Seurat::RunUMAP(dims = 1:30, reduction = "harmony")
Seurat::DimPlot(scobj, reduction = "umap", group.by = "Cluster")+
Seurat::DimPlot(scobj, reduction = "umap", group.by = "pancreas_gs_prediction")+
Seurat::DimPlot(scobj, reduction = "umap", group.by = "cell.type")
联系客服